X86ISelLowering.cpp

00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #define DEBUG_TYPE "x86-isel"
00016 #include "X86ISelLowering.h"
00017 #include "Utils/X86ShuffleDecode.h"
00018 #include "X86CallingConv.h"
00019 #include "X86InstrBuilder.h"
00020 #include "X86MachineFunctionInfo.h"
00021 #include "X86TargetMachine.h"
00022 #include "X86TargetObjectFile.h"
00023 #include "llvm/ADT/SmallSet.h"
00024 #include "llvm/ADT/Statistic.h"
00025 #include "llvm/ADT/StringExtras.h"
00026 #include "llvm/ADT/VariadicFunction.h"
00027 #include "llvm/CodeGen/IntrinsicLowering.h"
00028 #include "llvm/CodeGen/MachineFrameInfo.h"
00029 #include "llvm/CodeGen/MachineFunction.h"
00030 #include "llvm/CodeGen/MachineInstrBuilder.h"
00031 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00032 #include "llvm/CodeGen/MachineModuleInfo.h"
00033 #include "llvm/CodeGen/MachineRegisterInfo.h"
00034 #include "llvm/IR/CallSite.h"
00035 #include "llvm/IR/CallingConv.h"
00036 #include "llvm/IR/Constants.h"
00037 #include "llvm/IR/DerivedTypes.h"
00038 #include "llvm/IR/Function.h"
00039 #include "llvm/IR/GlobalAlias.h"
00040 #include "llvm/IR/GlobalVariable.h"
00041 #include "llvm/IR/Instructions.h"
00042 #include "llvm/IR/Intrinsics.h"
00043 #include "llvm/MC/MCAsmInfo.h"
00044 #include "llvm/MC/MCContext.h"
00045 #include "llvm/MC/MCExpr.h"
00046 #include "llvm/MC/MCSymbol.h"
00047 #include "llvm/Support/Debug.h"
00048 #include "llvm/Support/ErrorHandling.h"
00049 #include "llvm/Support/MathExtras.h"
00050 #include "llvm/Target/TargetOptions.h"
00051 #include <bitset>
00052 #include <cctype>
00053 using namespace llvm;
00054 
00055 STATISTIC(NumTailCalls, "Number of tail calls");
00056 
00057 // Forward declarations.
00058 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00059                        SDValue V2);
00060 
00061 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
00062                                 SelectionDAG &DAG, SDLoc dl,
00063                                 unsigned vectorWidth) {
00064   assert((vectorWidth == 128 || vectorWidth == 256) &&
00065          "Unsupported vector width");
00066   EVT VT = Vec.getValueType();
00067   EVT ElVT = VT.getVectorElementType();
00068   unsigned Factor = VT.getSizeInBits()/vectorWidth;
00069   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
00070                                   VT.getVectorNumElements()/Factor);
00071 
00072   // Extract from UNDEF is UNDEF.
00073   if (Vec.getOpcode() == ISD::UNDEF)
00074     return DAG.getUNDEF(ResultVT);
00075 
00076   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
00077   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
00078 
00079   // This is the index of the first element of the vectorWidth-bit chunk
00080   // we want.
00081   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
00082                                * ElemsPerChunk);
00083 
00084   // If the input is a buildvector just emit a smaller one.
00085   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
00086     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
00087                        Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
00088 
00089   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00090   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
00091                                VecIdx);
00092 
00093   return Result;
00094 
00095 }
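// As a rough illustration of the index math above: for a v8i32 Vec and
// vectorWidth == 128, ElVT is i32, so ElemsPerChunk == 4 and an IdxVal of 5
// normalizes to ((5 * 32) / 128) * 4 == 4; the resulting node extracts
// elements 4..7 as a v4i32.
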
00096 /// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
00097 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
00098 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
00099 /// instructions or a simple subregister reference. Idx is an index in the
00100 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
00101 /// lowering EXTRACT_VECTOR_ELT operations easier.
00102 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
00103                                    SelectionDAG &DAG, SDLoc dl) {
00104   assert((Vec.getValueType().is256BitVector() ||
00105           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
00106   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
00107 }
00108 
00109 /// Generate a DAG to grab 256-bits from a 512-bit vector.
00110 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
00111                                    SelectionDAG &DAG, SDLoc dl) {
00112   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
00113   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
00114 }
00115 
00116 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
00117                                unsigned IdxVal, SelectionDAG &DAG,
00118                                SDLoc dl, unsigned vectorWidth) {
00119   assert((vectorWidth == 128 || vectorWidth == 256) &&
00120          "Unsupported vector width");
00121   // Inserting UNDEF just returns Result unchanged.
00122   if (Vec.getOpcode() == ISD::UNDEF)
00123     return Result;
00124   EVT VT = Vec.getValueType();
00125   EVT ElVT = VT.getVectorElementType();
00126   EVT ResultVT = Result.getValueType();
00127 
00128   // Insert the relevant vectorWidth bits.
00129   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
00130 
00131   // This is the index of the first element of the vectorWidth-bit chunk
00132   // we want.
00133   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
00134                                * ElemsPerChunk);
00135 
00136   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00137   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
00138                      VecIdx);
00139 }
00140 /// Generate a DAG to put 128 bits into a vector > 128 bits.  This
00141 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
00142 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
00143 /// simple superregister reference.  Idx is an index in the 128 bits
00144 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
00145 /// lowering INSERT_VECTOR_ELT operations easier.
00146 static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
00147                                   unsigned IdxVal, SelectionDAG &DAG,
00148                                   SDLoc dl) {
00149   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
00150   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
00151 }
00152 
00153 static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
00154                                   unsigned IdxVal, SelectionDAG &DAG,
00155                                   SDLoc dl) {
00156   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
00157   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
00158 }
00159 
00160 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
00161 /// instructions. This is used because creating CONCAT_VECTOR nodes of
00162 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
00163 /// large BUILD_VECTORS.
00164 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
00165                                    unsigned NumElems, SelectionDAG &DAG,
00166                                    SDLoc dl) {
00167   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00168   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
00169 }
00170 
00171 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
00172                                    unsigned NumElems, SelectionDAG &DAG,
00173                                    SDLoc dl) {
00174   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00175   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
00176 }
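// As a sketch of how these helpers compose: Concat128BitVectors on two v4i32
// values first inserts V1 into an undef v8i32 at element 0 and then inserts
// V2 at element NumElems/2 == 4, so the 256-bit result holds V1 in the low
// half and V2 in the high half.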
00177 
00178 static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
00179   const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
00180   bool is64Bit = Subtarget->is64Bit();
00181 
00182   if (Subtarget->isTargetMacho()) {
00183     if (is64Bit)
00184       return new X86_64MachoTargetObjectFile();
00185     return new TargetLoweringObjectFileMachO();
00186   }
00187 
00188   if (Subtarget->isTargetLinux())
00189     return new X86LinuxTargetObjectFile();
00190   if (Subtarget->isTargetELF())
00191     return new TargetLoweringObjectFileELF();
00192   if (Subtarget->isTargetKnownWindowsMSVC())
00193     return new X86WindowsTargetObjectFile();
00194   if (Subtarget->isTargetCOFF())
00195     return new TargetLoweringObjectFileCOFF();
00196   llvm_unreachable("unknown subtarget type");
00197 }
00198 
00199 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
00200   : TargetLowering(TM, createTLOF(TM)) {
00201   Subtarget = &TM.getSubtarget<X86Subtarget>();
00202   X86ScalarSSEf64 = Subtarget->hasSSE2();
00203   X86ScalarSSEf32 = Subtarget->hasSSE1();
00204   TD = getDataLayout();
00205 
00206   resetOperationActions();
00207 }
00208 
00209 void X86TargetLowering::resetOperationActions() {
00210   const TargetMachine &TM = getTargetMachine();
00211   static bool FirstTimeThrough = true;
00212 
00213   // If none of the target options have changed, then we don't need to reset the
00214   // operation actions.
00215   if (!FirstTimeThrough && TO == TM.Options) return;
00216 
00217   if (!FirstTimeThrough) {
00218     // Reinitialize the actions.
00219     initActions();
00220     FirstTimeThrough = false;
00221   }
00222 
00223   TO = TM.Options;
00224 
00225   // Set up the TargetLowering object.
00226   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00227 
00228   // X86 is weird; it always uses i8 for shift amounts and setcc results.
00229   setBooleanContents(ZeroOrOneBooleanContent);
00230   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00231   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
00232 
00233   // For 64-bit code, use the ILP scheduler since we have so many registers;
00234   // for 32-bit code, use the register-pressure-specific scheduling.
00235   // For Atom, always use ILP scheduling.
00236   if (Subtarget->isAtom())
00237     setSchedulingPreference(Sched::ILP);
00238   else if (Subtarget->is64Bit())
00239     setSchedulingPreference(Sched::ILP);
00240   else
00241     setSchedulingPreference(Sched::RegPressure);
00242   const X86RegisterInfo *RegInfo =
00243     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
00244   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00245 
00246   // Bypass expensive divides on Atom when compiling at -O2 or higher
00247   if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
00248     addBypassSlowDiv(32, 8);
00249     if (Subtarget->is64Bit())
00250       addBypassSlowDiv(64, 16);
00251   }
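  // The pair (32, 8) asks codegen to guard each 32-bit division with a cheap
  // runtime check and to use an 8-bit divide when both operands happen to fit
  // in 8 bits; on 64-bit Atom the same trick maps 64-bit divides onto 16-bit
  // ones.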
00252 
00253   if (Subtarget->isTargetKnownWindowsMSVC()) {
00254     // Setup Windows compiler runtime calls.
00255     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00256     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00257     setLibcallName(RTLIB::SREM_I64, "_allrem");
00258     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00259     setLibcallName(RTLIB::MUL_I64, "_allmul");
00260     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00261     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00262     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00263     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00264     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00265 
00266     // The _ftol2 runtime function has an unusual calling conv, which
00267     // is modeled by a special pseudo-instruction.
00268     setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
00269     setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
00270     setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
00271     setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
00272   }
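  // To illustrate the libcall setup above: on 32-bit MSVC targets a 64-bit
  // signed division such as a hypothetical
  //   long long div64(long long a, long long b) { return a / b; }
  // ends up calling _alldiv (with the stdcall convention registered above)
  // rather than the usual __divdi3 helper.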
00273 
00274   if (Subtarget->isTargetDarwin()) {
00275     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00276     setUseUnderscoreSetJmp(false);
00277     setUseUnderscoreLongJmp(false);
00278   } else if (Subtarget->isTargetWindowsGNU()) {
00279     // The MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore)!
00280     setUseUnderscoreSetJmp(true);
00281     setUseUnderscoreLongJmp(false);
00282   } else {
00283     setUseUnderscoreSetJmp(true);
00284     setUseUnderscoreLongJmp(true);
00285   }
00286 
00287   // Set up the register classes.
00288   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00289   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00290   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00291   if (Subtarget->is64Bit())
00292     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00293 
00294   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
00295 
00296   // We don't accept any truncstore of integer registers.
00297   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00298   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00299   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00300   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00301   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00302   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00303 
00304   // SETOEQ and SETUNE require checking two conditions.
00305   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00306   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00307   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00308   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00309   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00310   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00311 
00312   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00313   // operation.
00314   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00315   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00316   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00317 
00318   if (Subtarget->is64Bit()) {
00319     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00320     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00321   } else if (!TM.Options.UseSoftFloat) {
00322     // We have an algorithm for SSE2->double, and we turn this into a
00323     // 64-bit FILD followed by conditional FADD for other targets.
00324     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00325     // We have an algorithm for SSE2, and we turn this into a 64-bit
00326     // FILD for other targets.
00327     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00328   }
00329 
00330   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00331   // this operation.
00332   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00333   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00334 
00335   if (!TM.Options.UseSoftFloat) {
00336     // SSE has no i16 to fp conversion, only i32
00337     if (X86ScalarSSEf32) {
00338       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00339       // f32 and f64 cases are Legal, f80 case is not
00340       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00341     } else {
00342       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00343       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00344     }
00345   } else {
00346     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00347     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00348   }
00349 
00350   // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
00351   // are Legal; f80 is custom lowered.
00352   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00353   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00354 
00355   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
00356   // this operation.
00357   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00358   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00359 
00360   if (X86ScalarSSEf32) {
00361     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00362     // f32 and f64 cases are Legal, f80 case is not
00363     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00364   } else {
00365     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00366     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00367   }
00368 
00369   // Handle FP_TO_UINT by promoting the destination to a larger signed
00370   // conversion.
00371   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00372   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00373   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
00374 
00375   if (Subtarget->is64Bit()) {
00376     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00377     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00378   } else if (!TM.Options.UseSoftFloat) {
00379     // Since AVX is a superset of SSE3, only check for SSE here.
00380     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00381       // Expand FP_TO_UINT into a select.
00382       // FIXME: We would like to use a Custom expander here eventually to do
00383       // the optimal thing for SSE vs. the default expansion in the legalizer.
00384       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00385     else
00386       // With SSE3 we can use fisttpll to convert to a signed i64; without
00387       // SSE, we're stuck with a fistpll.
00388       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00389   }
00390 
00391   if (isTargetFTOL()) {
00392     // Use the _ftol2 runtime function, which has a pseudo-instruction
00393     // to handle its weird calling convention.
00394     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00395   }
00396 
00397   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00398   if (!X86ScalarSSEf64) {
00399     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00400     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00401     if (Subtarget->is64Bit()) {
00402       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00403       // Without SSE, i64->f64 goes through memory.
00404       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00405     }
00406   }
00407 
00408   // Scalar integer divide and remainder are lowered to use operations that
00409   // produce two results, to match the available instructions. This exposes
00410   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00411   // into a single instruction.
00412   //
00413   // Scalar integer multiply-high is also lowered to use two-result
00414   // operations, to match the available instructions. However, plain multiply
00415   // (low) operations are left as Legal, as there are single-result
00416   // instructions for this in x86. Using the two-result multiply instructions
00417   // when both high and low results are needed must be arranged by dagcombine.
00418   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00419     MVT VT = IntVTs[i];
00420     setOperationAction(ISD::MULHS, VT, Expand);
00421     setOperationAction(ISD::MULHU, VT, Expand);
00422     setOperationAction(ISD::SDIV, VT, Expand);
00423     setOperationAction(ISD::UDIV, VT, Expand);
00424     setOperationAction(ISD::SREM, VT, Expand);
00425     setOperationAction(ISD::UREM, VT, Expand);
00426 
00427     // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
00428     setOperationAction(ISD::ADDC, VT, Custom);
00429     setOperationAction(ISD::ADDE, VT, Custom);
00430     setOperationAction(ISD::SUBC, VT, Custom);
00431     setOperationAction(ISD::SUBE, VT, Custom);
00432   }
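  // As a small example of the two-result lowering described above, IR such as
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // is expected to produce SDIVREM nodes with identical operands; CSE merges
  // them, and a single IDIV then yields both the quotient and the remainder.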
00433 
00434   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00435   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00436   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00437   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00438   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00439   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00440   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00441   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00442   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00443   setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
00444   if (Subtarget->is64Bit())
00445     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00446   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00447   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00448   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00449   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00450   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00451   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00452   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00453   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00454 
00455   // Promote the i8 variants and force them up to i32, which has a shorter
00456   // encoding.
00457   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00458   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00459   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00460   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
00461   if (Subtarget->hasBMI()) {
00462     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00463     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00464     if (Subtarget->is64Bit())
00465       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00466   } else {
00467     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00468     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00469     if (Subtarget->is64Bit())
00470       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00471   }
00472 
00473   if (Subtarget->hasLZCNT()) {
00474     // When promoting the i8 variants, force them to i32 for a shorter
00475     // encoding.
00476     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00477     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00478     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00479     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00480     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00481     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00482     if (Subtarget->is64Bit())
00483       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00484   } else {
00485     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00486     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00487     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00488     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00489     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00490     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00491     if (Subtarget->is64Bit()) {
00492       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00493       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00494     }
00495   }
00496 
00497   if (Subtarget->hasPOPCNT()) {
00498     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00499   } else {
00500     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00501     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00502     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00503     if (Subtarget->is64Bit())
00504       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00505   }
00506 
00507   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00508 
00509   if (!Subtarget->hasMOVBE())
00510     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00511 
00512   // These should be promoted to a larger select which is supported.
00513   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00514   // X86 wants to expand cmov itself.
00515   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00516   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00517   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00518   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00519   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00520   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00521   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00522   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00523   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00524   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00525   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00526   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00527   if (Subtarget->is64Bit()) {
00528     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00529     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00530   }
00531   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00532   // NOTE: the EH_SJLJ_SETJMP/_LONGJMP support here is NOT intended to support
00533   // SjLj exception handling, but rather a light-weight setjmp/longjmp
00534   // replacement used for continuations, user-level threading, etc. As a
00535   // result, no other SjLj exception interfaces are implemented, so please
00536   // don't build your own exception handling based on them.
00537   // LLVM/Clang supports zero-cost DWARF exception handling.
00538   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00539   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00540 
00541   // Darwin ABI issue.
00542   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00543   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00544   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00545   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00546   if (Subtarget->is64Bit())
00547     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00548   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00549   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00550   if (Subtarget->is64Bit()) {
00551     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00552     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00553     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00554     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00555     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00556   }
00557   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00558   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00559   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00560   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00561   if (Subtarget->is64Bit()) {
00562     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00563     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00564     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00565   }
00566 
00567   if (Subtarget->hasSSE1())
00568     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00569 
00570   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00571 
00572   // Expand certain atomics
00573   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00574     MVT VT = IntVTs[i];
00575     setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
00576     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00577     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00578   }
00579 
00580   if (!Subtarget->is64Bit()) {
00581     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
00582     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
00583     setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
00584     setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
00585     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
00586     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
00587     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
00588     setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
00589     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
00590     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
00591     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
00592     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
00593   }
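  // Note: without a native 64-bit atomic read-modify-write instruction in
  // 32-bit mode, the i64 operations above are custom lowered, typically to
  // compare-and-swap loops built around CMPXCHG8B.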
00594 
00595   if (Subtarget->hasCmpxchg16b()) {
00596     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
00597   }
00598 
00599   // FIXME - use subtarget debug flags
00600   if (!Subtarget->isTargetDarwin() &&
00601       !Subtarget->isTargetELF() &&
00602       !Subtarget->isTargetCygMing()) {
00603     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00604   }
00605 
00606   if (Subtarget->is64Bit()) {
00607     setExceptionPointerRegister(X86::RAX);
00608     setExceptionSelectorRegister(X86::RDX);
00609   } else {
00610     setExceptionPointerRegister(X86::EAX);
00611     setExceptionSelectorRegister(X86::EDX);
00612   }
00613   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00614   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00615 
00616   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00617   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00618 
00619   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00620   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00621 
00622   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00623   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00624   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00625   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00626     // TargetInfo::X86_64ABIBuiltinVaList
00627     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00628     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00629   } else {
00630     // TargetInfo::CharPtrBuiltinVaList
00631     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00632     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00633   }
00634 
00635   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00636   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00637 
00638   setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
00639                      MVT::i64 : MVT::i32, Custom);
00640 
00641   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00642     // f32 and f64 use SSE.
00643     // Set up the FP register classes.
00644     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00645     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00646 
00647     // Use ANDPD to simulate FABS.
00648     setOperationAction(ISD::FABS , MVT::f64, Custom);
00649     setOperationAction(ISD::FABS , MVT::f32, Custom);
00650 
00651     // Use XORP to simulate FNEG.
00652     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00653     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00654 
00655     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00656     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00657     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00658 
00659     // Lower this to FGETSIGNx86 plus an AND.
00660     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00661     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00662 
00663     // We don't support sin/cos/fmod
00664     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00665     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00666     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00667     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00668     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00669     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00670 
00671     // Expand FP immediates into loads from the stack, except for the special
00672     // cases we handle.
00673     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00674     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00675   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00676     // Use SSE for f32, x87 for f64.
00677     // Set up the FP register classes.
00678     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00679     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00680 
00681     // Use ANDPS to simulate FABS.
00682     setOperationAction(ISD::FABS , MVT::f32, Custom);
00683 
00684     // Use XORP to simulate FNEG.
00685     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00686 
00687     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00688 
00689     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00690     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00691     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00692 
00693     // We don't support sin/cos/fmod
00694     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00695     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00696     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00697 
00698     // Special cases we handle for FP constants.
00699     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00700     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00701     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00702     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00703     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00704 
00705     if (!TM.Options.UnsafeFPMath) {
00706       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00707       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00708       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00709     }
00710   } else if (!TM.Options.UseSoftFloat) {
00711     // f32 and f64 in x87.
00712     // Set up the FP register classes.
00713     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00714     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00715 
00716     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00717     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00718     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00719     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00720 
00721     if (!TM.Options.UnsafeFPMath) {
00722       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00723       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00724       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00725       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00726       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00727       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00728     }
00729     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00730     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00731     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00732     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00733     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00734     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00735     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00736     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00737   }
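  // In all of the cases above, addLegalFPImmediate records constants the
  // backend can materialize without a constant-pool load: +0.0 via xorps/xorpd
  // in the SSE paths, and FLD0/FLD1 (optionally followed by FCHS to flip the
  // sign) in the x87 paths.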
00738 
00739   // We don't support FMA.
00740   setOperationAction(ISD::FMA, MVT::f64, Expand);
00741   setOperationAction(ISD::FMA, MVT::f32, Expand);
00742 
00743   // Long double always uses X87.
00744   if (!TM.Options.UseSoftFloat) {
00745     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00746     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00747     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00748     {
00749       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00750       addLegalFPImmediate(TmpFlt);  // FLD0
00751       TmpFlt.changeSign();
00752       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00753 
00754       bool ignored;
00755       APFloat TmpFlt2(+1.0);
00756       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00757                       &ignored);
00758       addLegalFPImmediate(TmpFlt2);  // FLD1
00759       TmpFlt2.changeSign();
00760       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00761     }
00762 
00763     if (!TM.Options.UnsafeFPMath) {
00764       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00765       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00766       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00767     }
00768 
00769     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00770     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00771     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00772     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00773     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00774     setOperationAction(ISD::FMA, MVT::f80, Expand);
00775   }
00776 
00777   // Always use a library call for pow.
00778   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00779   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00780   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00781 
00782   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00783   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00784   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00785   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00786   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00787 
00788   // First set operation action for all vector types to either promote
00789   // (for widening) or expand (for scalarization). Then we will selectively
00790   // turn on ones that can be effectively codegen'd.
00791   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
00792            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
00793     MVT VT = (MVT::SimpleValueType)i;
00794     setOperationAction(ISD::ADD , VT, Expand);
00795     setOperationAction(ISD::SUB , VT, Expand);
00796     setOperationAction(ISD::FADD, VT, Expand);
00797     setOperationAction(ISD::FNEG, VT, Expand);
00798     setOperationAction(ISD::FSUB, VT, Expand);
00799     setOperationAction(ISD::MUL , VT, Expand);
00800     setOperationAction(ISD::FMUL, VT, Expand);
00801     setOperationAction(ISD::SDIV, VT, Expand);
00802     setOperationAction(ISD::UDIV, VT, Expand);
00803     setOperationAction(ISD::FDIV, VT, Expand);
00804     setOperationAction(ISD::SREM, VT, Expand);
00805     setOperationAction(ISD::UREM, VT, Expand);
00806     setOperationAction(ISD::LOAD, VT, Expand);
00807     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00808     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00809     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00810     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00811     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00812     setOperationAction(ISD::FABS, VT, Expand);
00813     setOperationAction(ISD::FSIN, VT, Expand);
00814     setOperationAction(ISD::FSINCOS, VT, Expand);
00815     setOperationAction(ISD::FCOS, VT, Expand);
00816     setOperationAction(ISD::FSINCOS, VT, Expand);
00817     setOperationAction(ISD::FREM, VT, Expand);
00818     setOperationAction(ISD::FMA,  VT, Expand);
00819     setOperationAction(ISD::FPOWI, VT, Expand);
00820     setOperationAction(ISD::FSQRT, VT, Expand);
00821     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00822     setOperationAction(ISD::FFLOOR, VT, Expand);
00823     setOperationAction(ISD::FCEIL, VT, Expand);
00824     setOperationAction(ISD::FTRUNC, VT, Expand);
00825     setOperationAction(ISD::FRINT, VT, Expand);
00826     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00827     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00828     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00829     setOperationAction(ISD::SDIVREM, VT, Expand);
00830     setOperationAction(ISD::UDIVREM, VT, Expand);
00831     setOperationAction(ISD::FPOW, VT, Expand);
00832     setOperationAction(ISD::CTPOP, VT, Expand);
00833     setOperationAction(ISD::CTTZ, VT, Expand);
00834     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00835     setOperationAction(ISD::CTLZ, VT, Expand);
00836     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00837     setOperationAction(ISD::SHL, VT, Expand);
00838     setOperationAction(ISD::SRA, VT, Expand);
00839     setOperationAction(ISD::SRL, VT, Expand);
00840     setOperationAction(ISD::ROTL, VT, Expand);
00841     setOperationAction(ISD::ROTR, VT, Expand);
00842     setOperationAction(ISD::BSWAP, VT, Expand);
00843     setOperationAction(ISD::SETCC, VT, Expand);
00844     setOperationAction(ISD::FLOG, VT, Expand);
00845     setOperationAction(ISD::FLOG2, VT, Expand);
00846     setOperationAction(ISD::FLOG10, VT, Expand);
00847     setOperationAction(ISD::FEXP, VT, Expand);
00848     setOperationAction(ISD::FEXP2, VT, Expand);
00849     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00850     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00851     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00852     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00853     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00854     setOperationAction(ISD::TRUNCATE, VT, Expand);
00855     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00856     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00857     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00858     setOperationAction(ISD::VSELECT, VT, Expand);
00859     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
00860              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
00861       setTruncStoreAction(VT,
00862                           (MVT::SimpleValueType)InnerVT, Expand);
00863     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
00864     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
00865     setLoadExtAction(ISD::EXTLOAD, VT, Expand);
00866   }
00867 
00868   // FIXME: In order to prevent SSE instructions from being expanded to MMX ones
00869   // with -msoft-float, disable use of MMX as well.
00870   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00871     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00872     // No operations on x86mmx are supported; everything uses intrinsics.
00873   }
00874 
00875   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00876   // into smaller operations.
00877   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
00878   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
00879   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
00880   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
00881   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
00882   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
00883   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
00884   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
00885   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
00886   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
00887   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
00888   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
00889   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
00890   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
00891   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
00892   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
00893   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
00894   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
00895   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
00896   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
00897   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00898   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
00899   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
00900   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
00901   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
00902   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
00903   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
00904   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
00905   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
00906 
00907   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00908     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00909 
00910     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00911     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00912     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00913     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00914     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00915     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00916     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00917     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00918     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00919     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00920     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00921     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00922   }
00923 
00924   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00925     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00926 
00927     // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
00928     // registers cannot be used even for integer operations.
00929     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00930     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00931     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00932     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00933 
00934     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00935     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00936     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00937     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00938     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00939     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00940     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00941     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00942     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00943     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00944     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00945     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00946     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00947     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00948     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00949     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00950     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00951     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00952 
00953     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00954     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00955     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00956     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00957 
00958     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00959     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00960     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00961     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00962     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00963 
00964     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
00965     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00966       MVT VT = (MVT::SimpleValueType)i;
00967       // Do not attempt to custom lower non-power-of-2 vectors
00968       if (!isPowerOf2_32(VT.getVectorNumElements()))
00969         continue;
00970       // Do not attempt to custom lower non-128-bit vectors
00971       if (!VT.is128BitVector())
00972         continue;
00973       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
00974       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
00975       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
00976     }
00977 
00978     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
00979     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
00980     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
00981     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
00982     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
00983     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
00984 
00985     if (Subtarget->is64Bit()) {
00986       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
00987       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
00988     }
00989 
00990     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
00991     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00992       MVT VT = (MVT::SimpleValueType)i;
00993 
00994       // Do not attempt to promote non-128-bit vectors
00995       if (!VT.is128BitVector())
00996         continue;
00997 
00998       setOperationAction(ISD::AND,    VT, Promote);
00999       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
01000       setOperationAction(ISD::OR,     VT, Promote);
01001       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
01002       setOperationAction(ISD::XOR,    VT, Promote);
01003       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
01004       setOperationAction(ISD::LOAD,   VT, Promote);
01005       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
01006       setOperationAction(ISD::SELECT, VT, Promote);
01007       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
01008     }
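    // Promote here means the operation is performed in the v2i64 type: the
    // legalizer bitcasts e.g. a v4i32 AND to v2i64, applies the one 128-bit
    // operation, and bitcasts the result back.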
01009 
01010     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
01011 
01012     // Custom lower v2i64 and v2f64 selects.
01013     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
01014     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
01015     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
01016     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
01017 
01018     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
01019     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
01020 
01021     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
01022     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
01023     // As there is no 64-bit GPR available, we need to build a special custom
01024     // sequence to convert from v2i32 to v2f32.
01025     if (!Subtarget->is64Bit())
01026       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
01027 
01028     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
01029     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
01030 
01031     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
01032   }
01033 
01034   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
01035     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
01036     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
01037     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
01038     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
01039     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
01040     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
01041     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
01042     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
01043     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
01044     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
01045 
01046     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
01047     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
01048     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
01049     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
01050     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
01051     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
01052     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
01053     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
01054     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
01055     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
01056 
01057     // FIXME: Do we need to handle scalar-to-vector here?
01058     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
01059 
01060     setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
01061     setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
01062     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
01063     setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
01064     setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
01065 
01066     // i8 and i16 vectors are custom, because the source register and source
01067     // memory operand types are not the same width.  f32 vectors are
01068     // custom since the immediate controlling the insert encodes additional
01069     // information.
01070     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
01071     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01072     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01073     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01074 
01075     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
01076     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
01077     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
01078     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
01079 
01080     // FIXME: these should be Legal, but that's only for the case where
01081     // the index is constant.  For now custom expand to deal with that.
01082     if (Subtarget->is64Bit()) {
01083       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01084       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01085     }
01086   }
01087 
01088   if (Subtarget->hasSSE2()) {
01089     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
01090     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
01091 
01092     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01093     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01094 
01095     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01096     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01097 
01098     // In the customized shift lowering, the legal cases in AVX2 will be
01099     // recognized.
01100     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01101     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01102 
01103     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01104     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01105 
01106     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01107 
01108     setOperationAction(ISD::SDIV,              MVT::v8i16, Custom);
01109     setOperationAction(ISD::SDIV,              MVT::v4i32, Custom);
01110   }
01111 
01112   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01113     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01114     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01115     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01116     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01117     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01118     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01119 
01120     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01121     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01122     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01123 
01124     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01125     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01126     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01127     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01128     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01129     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01130     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01131     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01132     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01133     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01134     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01135     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01136 
01137     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01138     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01139     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01140     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01141     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01142     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01143     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01144     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01145     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01146     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01147     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01148     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01149 
01150     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01151     // even though v8i16 is a legal type.
01152     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01153     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01154     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01155 
01156     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01157     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01158     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01159 
01160     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01161     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01162 
01163     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
01164 
01165     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01166     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01167 
01168     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01169     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01170 
01171     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01172     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01173 
01174     setOperationAction(ISD::SDIV,              MVT::v16i16, Custom);
01175 
01176     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01177     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01178     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01179     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01180 
01181     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01182     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01183     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01184 
01185     setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
01186     setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
01187     setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
01188     setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
01189 
01190     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01191     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01192     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01193     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01194     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01195     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01196     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01197     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01198     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01199     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01200     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01201     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01202 
01203     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01204       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01205       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01206       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01207       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01208       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01209       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01210     }
01211 
01212     if (Subtarget->hasInt256()) {
01213       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01214       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01215       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01216       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01217 
01218       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01219       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01220       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01221       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01222 
01223       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01224       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01225       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01226       // Don't lower v32i8 because there is no 128-bit byte mul
01227 
01228       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01229 
01230       setOperationAction(ISD::SDIV,            MVT::v8i32, Custom);
01231     } else {
01232       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01233       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01234       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01235       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01236 
01237       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01238       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01239       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01240       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01241 
01242       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01243       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01244       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01245       // Don't lower v32i8 because there is no 128-bit byte mul
01246     }
01247 
01248     // In the customized shift lowering, the legal cases in AVX2 will be
01249     // recognized.
01250     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01251     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01252 
01253     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01254     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01255 
01256     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01257 
01258     // Custom lower several nodes for 256-bit types.
01259     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01260              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01261       MVT VT = (MVT::SimpleValueType)i;
01262 
01263       // Extract subvector is special because the value type
01264       // (result) is 128-bit but the source is 256-bit wide.
01265       if (VT.is128BitVector())
01266         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01267 
01268       // Do not attempt to custom lower other non-256-bit vectors
01269       if (!VT.is256BitVector())
01270         continue;
01271 
01272       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01273       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01274       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01275       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01276       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01277       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01278       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01279     }
01280 
01281     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01282     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01283       MVT VT = (MVT::SimpleValueType)i;
01284 
01285       // Do not attempt to promote non-256-bit vectors
01286       if (!VT.is256BitVector())
01287         continue;
01288 
01289       setOperationAction(ISD::AND,    VT, Promote);
01290       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01291       setOperationAction(ISD::OR,     VT, Promote);
01292       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01293       setOperationAction(ISD::XOR,    VT, Promote);
01294       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01295       setOperationAction(ISD::LOAD,   VT, Promote);
01296       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01297       setOperationAction(ISD::SELECT, VT, Promote);
01298       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01299     }
01300   }
01301 
01302   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01303     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01304     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01305     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01306     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01307 
01308     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01309     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01310     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01311 
01312     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01313     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01314     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01315     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01316     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01317     setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
01318     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01319     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01320     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01321     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01322     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01323 
01324     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01325     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01326     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01327     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01328     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01329     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01330 
01331     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01332     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01333     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01334     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01335     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01336     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01337     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01338     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01339     setOperationAction(ISD::SDIV,               MVT::v16i32, Custom);
01340 
01341     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01342     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01343     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01344     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01345     if (Subtarget->is64Bit()) {
01346       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01347       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01348       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01349       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01350     }
01351     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01352     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01353     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01354     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01355     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01356     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01357     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01358     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01359     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01360     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01361 
01362     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01363     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01364     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01365     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01366     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01367     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01368     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01369     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01370     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01371     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01372     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01373     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01374     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01375 
01376     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01377     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01378     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01379     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01380     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
01381     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01382 
01383     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01384     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01385 
01386     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01387 
01388     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01389     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01390     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01391     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01392     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01393     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01394     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01395     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01396     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01397 
01398     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01399     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01400 
01401     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01402     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01403 
01404     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01405 
01406     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01407     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01408 
01409     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01410     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01411 
01412     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01413     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01414 
01415     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01416     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01417     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01418     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01419     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01420     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01421 
01422     // Custom lower several nodes.
01423     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01424              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01425       MVT VT = (MVT::SimpleValueType)i;
01426 
01427       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01428       // Extract subvector is special because the value type
01429       // (result) is 256/128-bit but the source is 512-bit wide.
01430       if (VT.is128BitVector() || VT.is256BitVector())
01431         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01432 
01433       if (VT.getVectorElementType() == MVT::i1)
01434         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01435 
01436       // Do not attempt to custom lower other non-512-bit vectors
01437       if (!VT.is512BitVector())
01438         continue;
01439 
01440       if (EltSize >= 32) {
01441         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01442         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01443         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01444         setOperationAction(ISD::VSELECT,             VT, Legal);
01445         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01446         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01447         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01448       }
01449     }
01450     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01451       MVT VT = (MVT::SimpleValueType)i;
01452 
01453       // Do not attempt to promote non-512-bit vectors
01454       if (!VT.is512BitVector())
01455         continue;
01456 
01457       setOperationAction(ISD::SELECT, VT, Promote);
01458       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01459     }
01460   } // has AVX-512
01461 
01462   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
01463   // of this type with custom code.
01464   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
01465            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
01466     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
01467                        Custom);
01468   }
01469 
01470   // We want to custom lower some of our intrinsics.
01471   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01472   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01473   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01474 
01475   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01476   // handle type legalization for these operations here.
01477   //
01478   // FIXME: We really should do custom legalization for addition and
01479   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01480   // than generic legalization for 64-bit multiplication-with-overflow, though.
01481   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01482     // Add/Sub/Mul with overflow operations are custom lowered.
01483     MVT VT = IntVTs[i];
01484     setOperationAction(ISD::SADDO, VT, Custom);
01485     setOperationAction(ISD::UADDO, VT, Custom);
01486     setOperationAction(ISD::SSUBO, VT, Custom);
01487     setOperationAction(ISD::USUBO, VT, Custom);
01488     setOperationAction(ISD::SMULO, VT, Custom);
01489     setOperationAction(ISD::UMULO, VT, Custom);
01490   }
01491 
01492   // There are no 8-bit 3-address imul/mul instructions
01493   setOperationAction(ISD::SMULO, MVT::i8, Expand);
01494   setOperationAction(ISD::UMULO, MVT::i8, Expand);
01495 
01496   if (!Subtarget->is64Bit()) {
01497     // These libcalls are not available in 32-bit.
01498     setLibcallName(RTLIB::SHL_I128, 0);
01499     setLibcallName(RTLIB::SRL_I128, 0);
01500     setLibcallName(RTLIB::SRA_I128, 0);
01501   }
01502 
01503   // Combine sin / cos into one node or libcall if possible.
01504   if (Subtarget->hasSinCos()) {
01505     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01506     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01507     if (Subtarget->isTargetDarwin()) {
01508       // For MacOSX, we don't want the normal expansion of a libcall to
01509       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
01510       // traffic.
01511       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01512       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01513     }
01514   }
01515 
01516   // We have target-specific dag combine patterns for the following nodes:
01517   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01518   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01519   setTargetDAGCombine(ISD::VSELECT);
01520   setTargetDAGCombine(ISD::SELECT);
01521   setTargetDAGCombine(ISD::SHL);
01522   setTargetDAGCombine(ISD::SRA);
01523   setTargetDAGCombine(ISD::SRL);
01524   setTargetDAGCombine(ISD::OR);
01525   setTargetDAGCombine(ISD::AND);
01526   setTargetDAGCombine(ISD::ADD);
01527   setTargetDAGCombine(ISD::FADD);
01528   setTargetDAGCombine(ISD::FSUB);
01529   setTargetDAGCombine(ISD::FMA);
01530   setTargetDAGCombine(ISD::SUB);
01531   setTargetDAGCombine(ISD::LOAD);
01532   setTargetDAGCombine(ISD::STORE);
01533   setTargetDAGCombine(ISD::ZERO_EXTEND);
01534   setTargetDAGCombine(ISD::ANY_EXTEND);
01535   setTargetDAGCombine(ISD::SIGN_EXTEND);
01536   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01537   setTargetDAGCombine(ISD::TRUNCATE);
01538   setTargetDAGCombine(ISD::SINT_TO_FP);
01539   setTargetDAGCombine(ISD::SETCC);
01540   if (Subtarget->is64Bit())
01541     setTargetDAGCombine(ISD::MUL);
01542   setTargetDAGCombine(ISD::XOR);
01543 
01544   computeRegisterProperties();
01545 
01546   // On Darwin, -Os means optimize for size without hurting performance, so
01547   // do not reduce the limit.
01548   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01549   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01550   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01551   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01552   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01553   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01554   setPrefLoopAlignment(4); // 2^4 bytes.
01555 
01556   // Predictable cmovs don't hurt on Atom because it's in-order.
01557   PredictableSelectIsExpensive = !Subtarget->isAtom();
01558 
01559   setPrefFunctionAlignment(4); // 2^4 bytes.
01560 }
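
// A minimal sketch (the names `TLI` and `Op` are placeholders, not from this
// file) of how the action table built by the setOperationAction() calls in the
// constructor above is consumed during DAG legalization: Legal nodes are
// selected directly, Custom nodes are routed to LowerOperation(), Promote
// nodes are widened, and Expand nodes fall back to generic expansion.
static bool needsCustomLowering(const TargetLowering &TLI, SDValue Op) {
  return TLI.getOperationAction(Op.getOpcode(), Op.getValueType()) ==
         TargetLowering::Custom;
}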
01561 
01562 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01563   if (!VT.isVector())
01564     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
01565 
01566   if (Subtarget->hasAVX512())
01567     switch (VT.getVectorNumElements()) {
01568     case  8: return MVT::v8i1;
01569     case 16: return MVT::v16i1;
01570     }
01571 
01572   return VT.changeVectorElementTypeToInteger();
01573 }
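
// A small usage sketch for the hook above; `exampleSetCCTypes` is a
// hypothetical helper, and the commented results assume an AVX-512 subtarget.
static void exampleSetCCTypes(const X86TargetLowering &TLI, LLVMContext &Ctx) {
  EVT Scalar = TLI.getSetCCResultType(Ctx, MVT::f64);    // MVT::i1
  EVT Mask   = TLI.getSetCCResultType(Ctx, MVT::v16f32); // MVT::v16i1
  EVT IntVec = TLI.getSetCCResultType(Ctx, MVT::v4f32);  // MVT::v4i32
  (void)Scalar; (void)Mask; (void)IntVec;
}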
01574 
01575 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
01576 /// the desired ByVal argument alignment.
01577 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01578   if (MaxAlign == 16)
01579     return;
01580   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01581     if (VTy->getBitWidth() == 128)
01582       MaxAlign = 16;
01583   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01584     unsigned EltAlign = 0;
01585     getMaxByValAlign(ATy->getElementType(), EltAlign);
01586     if (EltAlign > MaxAlign)
01587       MaxAlign = EltAlign;
01588   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01589     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01590       unsigned EltAlign = 0;
01591       getMaxByValAlign(STy->getElementType(i), EltAlign);
01592       if (EltAlign > MaxAlign)
01593         MaxAlign = EltAlign;
01594       if (MaxAlign == 16)
01595         break;
01596     }
01597   }
01598 }
01599 
01600 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
01601 /// function arguments in the caller parameter area. For X86, aggregates
01602 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01603 /// are at 4-byte boundaries.
01604 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01605   if (Subtarget->is64Bit()) {
01606     // Max of 8 and alignment of type.
01607     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01608     if (TyAlign > 8)
01609       return TyAlign;
01610     return 8;
01611   }
01612 
01613   unsigned Align = 4;
01614   if (Subtarget->hasSSE1())
01615     getMaxByValAlign(Ty, Align);
01616   return Align;
01617 }
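
// A worked sketch of the alignment computed above; the helper and the IR
// types are hypothetical, and the commented results assume a 32-bit subtarget
// with SSE1 enabled.
static void exampleByValAlign(const X86TargetLowering &TLI, LLVMContext &Ctx) {
  Type *V4F32 = VectorType::get(Type::getFloatTy(Ctx), 4);   // 128-bit vector
  Type *Members[] = { V4F32, Type::getInt32Ty(Ctx) };
  Type *WithVec  = StructType::get(Ctx, Members);            // holds an SSE vector
  Type *IntsOnly = ArrayType::get(Type::getInt32Ty(Ctx), 8); // scalars only
  unsigned A0 = TLI.getByValTypeAlignment(WithVec);  // 16: bumped by the vector
  unsigned A1 = TLI.getByValTypeAlignment(IntsOnly); //  4: the i386 default
  (void)A0; (void)A1;
}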
01618 
01619 /// getOptimalMemOpType - Returns the target specific optimal type for load
01620 /// and store operations as a result of memset, memcpy, and memmove
01621 /// lowering. If DstAlign is zero that means it's safe to destination
01622 /// lowering. If DstAlign is zero, that means the destination's
01623 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero, it
01624 /// means there isn't a need to check it against the alignment requirement,
01625 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
01626 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
01627 /// source is constant so it does not need to be loaded.
01628 /// It returns EVT::Other if the type should be determined using generic
01629 /// target-independent logic.
01630 EVT
01631 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01632                                        unsigned DstAlign, unsigned SrcAlign,
01633                                        bool IsMemset, bool ZeroMemset,
01634                                        bool MemcpyStrSrc,
01635                                        MachineFunction &MF) const {
01636   const Function *F = MF.getFunction();
01637   if ((!IsMemset || ZeroMemset) &&
01638       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
01639                                        Attribute::NoImplicitFloat)) {
01640     if (Size >= 16 &&
01641         (Subtarget->isUnalignedMemAccessFast() ||
01642          ((DstAlign == 0 || DstAlign >= 16) &&
01643           (SrcAlign == 0 || SrcAlign >= 16)))) {
01644       if (Size >= 32) {
01645         if (Subtarget->hasInt256())
01646           return MVT::v8i32;
01647         if (Subtarget->hasFp256())
01648           return MVT::v8f32;
01649       }
01650       if (Subtarget->hasSSE2())
01651         return MVT::v4i32;
01652       if (Subtarget->hasSSE1())
01653         return MVT::v4f32;
01654     } else if (!MemcpyStrSrc && Size >= 8 &&
01655                !Subtarget->is64Bit() &&
01656                Subtarget->hasSSE2()) {
01657       // Do not use f64 to lower memcpy if the source is a string constant. It's
01658       // better to use i32 to avoid the loads.
01659       return MVT::f64;
01660     }
01661   }
01662   if (Subtarget->is64Bit() && Size >= 8)
01663     return MVT::i64;
01664   return MVT::i32;
01665 }
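
// For reference, a few outcomes of the selection logic above (assuming the
// function is not marked noimplicitfloat):
//   32+ byte copy, AVX2, fast or 16-byte-aligned access     -> MVT::v8i32
//   16+ byte copy, SSE2 only                                -> MVT::v4i32
//   8-byte copy on a 32-bit target with SSE2, loaded source -> MVT::f64
//   anything else                                           -> i64 (64-bit) or i32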
01666 
01667 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01668   if (VT == MVT::f32)
01669     return X86ScalarSSEf32;
01670   else if (VT == MVT::f64)
01671     return X86ScalarSSEf64;
01672   return true;
01673 }
01674 
01675 bool
01676 X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
01677                                                  unsigned,
01678                                                  bool *Fast) const {
01679   if (Fast)
01680     *Fast = Subtarget->isUnalignedMemAccessFast();
01681   return true;
01682 }
01683 
01684 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
01685 /// current function.  The returned value is a member of the
01686 /// MachineJumpTableInfo::JTEntryKind enum.
01687 unsigned X86TargetLowering::getJumpTableEncoding() const {
01688   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01689   // symbol.
01690   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01691       Subtarget->isPICStyleGOT())
01692     return MachineJumpTableInfo::EK_Custom32;
01693 
01694   // Otherwise, use the normal jump table encoding heuristics.
01695   return TargetLowering::getJumpTableEncoding();
01696 }
01697 
01698 const MCExpr *
01699 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01700                                              const MachineBasicBlock *MBB,
01701                                              unsigned uid,MCContext &Ctx) const{
01702   assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01703          Subtarget->isPICStyleGOT());
01704   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01705   // entries.
01706   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01707                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01708 }
01709 
01710 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
01711 /// jumptable.
01712 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01713                                                     SelectionDAG &DAG) const {
01714   if (!Subtarget->is64Bit())
01715     // This doesn't have SDLoc associated with it, but is not really the
01716     // same as a Register.
01717     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01718   return Table;
01719 }
01720 
01721 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
01722 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
01723 /// MCExpr.
01724 const MCExpr *X86TargetLowering::
01725 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01726                              MCContext &Ctx) const {
01727   // X86-64 uses RIP relative addressing based on the jump table label.
01728   if (Subtarget->isPICStyleRIPRel())
01729     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01730 
01731   // Otherwise, the reference is relative to the PIC base.
01732   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01733 }
01734 
01735 // FIXME: Why is this routine here? Move to RegInfo!
01736 std::pair<const TargetRegisterClass*, uint8_t>
01737 X86TargetLowering::findRepresentativeClass(MVT VT) const{
01738   const TargetRegisterClass *RRC = 0;
01739   uint8_t Cost = 1;
01740   switch (VT.SimpleTy) {
01741   default:
01742     return TargetLowering::findRepresentativeClass(VT);
01743   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01744     RRC = Subtarget->is64Bit() ?
01745       (const TargetRegisterClass*)&X86::GR64RegClass :
01746       (const TargetRegisterClass*)&X86::GR32RegClass;
01747     break;
01748   case MVT::x86mmx:
01749     RRC = &X86::VR64RegClass;
01750     break;
01751   case MVT::f32: case MVT::f64:
01752   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01753   case MVT::v4f32: case MVT::v2f64:
01754   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01755   case MVT::v4f64:
01756     RRC = &X86::VR128RegClass;
01757     break;
01758   }
01759   return std::make_pair(RRC, Cost);
01760 }
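
// Example results of the mapping above (for reference only): scalar integers
// share GR32 or GR64 depending on the subtarget, MMX uses VR64, and all the
// listed FP and vector types share VR128 as their representative class.
//
//   findRepresentativeClass(MVT::i32)    -> { GR32 or GR64, cost 1 }
//   findRepresentativeClass(MVT::x86mmx) -> { VR64,  cost 1 }
//   findRepresentativeClass(MVT::v8f32)  -> { VR128, cost 1 }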
01761 
01762 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01763                                                unsigned &Offset) const {
01764   if (!Subtarget->isTargetLinux())
01765     return false;
01766 
01767   if (Subtarget->is64Bit()) {
01768     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
01769     Offset = 0x28;
01770     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01771       AddressSpace = 256;
01772     else
01773       AddressSpace = 257;
01774   } else {
01775     // %gs:0x14 on i386
01776     Offset = 0x14;
01777     AddressSpace = 256;
01778   }
01779   return true;
01780 }
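
// A sketch of how stack-protector code might consume the hook above; the
// helper name is hypothetical. On 64-bit Linux this reports offset 0x28 in
// address space 257 (%fs), and 0x14 in address space 256 (%gs) on i386.
static bool exampleStackCookie(const X86TargetLowering &TLI) {
  unsigned AddrSpace = 0, Offset = 0;
  if (!TLI.getStackCookieLocation(AddrSpace, Offset))
    return false; // not a Linux target; fall back to the default guard symbol
  // The cookie load would be emitted from {AddrSpace, Offset}.
  return true;
}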
01781 
01782 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01783                                             unsigned DestAS) const {
01784   assert(SrcAS != DestAS && "Expected different address spaces!");
01785 
01786   return SrcAS < 256 && DestAS < 256;
01787 }
01788 
01789 //===----------------------------------------------------------------------===//
01790 //               Return Value Calling Convention Implementation
01791 //===----------------------------------------------------------------------===//
01792 
01793 #include "X86GenCallingConv.inc"
01794 
01795 bool
01796 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01797                                   MachineFunction &MF, bool isVarArg,
01798                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01799                         LLVMContext &Context) const {
01800   SmallVector<CCValAssign, 16> RVLocs;
01801   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
01802                  RVLocs, Context);
01803   return CCInfo.CheckReturn(Outs, RetCC_X86);
01804 }
01805 
01806 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
01807   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
01808   return ScratchRegs;
01809 }
01810 
01811 SDValue
01812 X86TargetLowering::LowerReturn(SDValue Chain,
01813                                CallingConv::ID CallConv, bool isVarArg,
01814                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01815                                const SmallVectorImpl<SDValue> &OutVals,
01816                                SDLoc dl, SelectionDAG &DAG) const {
01817   MachineFunction &MF = DAG.getMachineFunction();
01818   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01819 
01820   SmallVector<CCValAssign, 16> RVLocs;
01821   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
01822                  RVLocs, *DAG.getContext());
01823   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01824 
01825   SDValue Flag;
01826   SmallVector<SDValue, 6> RetOps;
01827   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
01828   // Operand #1 = Bytes To Pop
01829   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
01830                    MVT::i16));
01831 
01832   // Copy the result values into the output registers.
01833   for (unsigned i = 0; i != RVLocs.size(); ++i) {
01834     CCValAssign &VA = RVLocs[i];
01835     assert(VA.isRegLoc() && "Can only return in registers!");
01836     SDValue ValToCopy = OutVals[i];
01837     EVT ValVT = ValToCopy.getValueType();
01838 
01839     // Promote values to the appropriate types
01840     if (VA.getLocInfo() == CCValAssign::SExt)
01841       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
01842     else if (VA.getLocInfo() == CCValAssign::ZExt)
01843       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
01844     else if (VA.getLocInfo() == CCValAssign::AExt)
01845       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
01846     else if (VA.getLocInfo() == CCValAssign::BCvt)
01847       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
01848 
01849     assert(VA.getLocInfo() != CCValAssign::FPExt &&
01850            "Unexpected FP-extend for return value.");  
01851 
01852     // If this is x86-64, and we disabled SSE, we can't return FP values,
01853     // or SSE or MMX vectors.
01854     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
01855          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
01856           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
01857       report_fatal_error("SSE register return with SSE disabled");
01858     }
01859     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
01860     // llvm-gcc has never done it right and no one has noticed, so this
01861     // should be OK for now.
01862     if (ValVT == MVT::f64 &&
01863         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
01864       report_fatal_error("SSE2 register return with SSE2 disabled");
01865 
01866     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
01867     // the RET instruction and handled by the FP Stackifier.
01868     if (VA.getLocReg() == X86::ST0 ||
01869         VA.getLocReg() == X86::ST1) {
01870       // If this is a copy from an xmm register to ST(0), use an FPExtend to
01871       // change the value to the FP stack register class.
01872       if (isScalarFPTypeInSSEReg(VA.getValVT()))
01873         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
01874       RetOps.push_back(ValToCopy);
01875       // Don't emit a copytoreg.
01876       continue;
01877     }
01878 
01879     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
01880     // which is returned in RAX / RDX.
01881     if (Subtarget->is64Bit()) {
01882       if (ValVT == MVT::x86mmx) {
01883         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
01884           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
01885           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
01886                                   ValToCopy);
01887           // If we don't have SSE2 available, convert to v4f32 so the generated
01888           // register is legal.
01889           if (!Subtarget->hasSSE2())
01890             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
01891         }
01892       }
01893     }
01894 
01895     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
01896     Flag = Chain.getValue(1);
01897     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
01898   }
01899 
01900   // The x86-64 ABIs require that for returning structs by value we copy
01901   // the sret argument into %rax/%eax (depending on ABI) for the return.
01902   // Win32 requires us to put the sret argument in %eax as well.
01903   // We saved the argument into a virtual register in the entry block,
01904   // so now we copy the value out and into %rax/%eax.
01905   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
01906       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
01907     MachineFunction &MF = DAG.getMachineFunction();
01908     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01909     unsigned Reg = FuncInfo->getSRetReturnReg();
01910     assert(Reg &&
01911            "SRetReturnReg should have been set in LowerFormalArguments().");
01912     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
01913 
01914     unsigned RetValReg
01915         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
01916           X86::RAX : X86::EAX;
01917     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
01918     Flag = Chain.getValue(1);
01919 
01920     // RAX/EAX now acts like a return value.
01921     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
01922   }
01923 
01924   RetOps[0] = Chain;  // Update chain.
01925 
01926   // Add the flag if we have it.
01927   if (Flag.getNode())
01928     RetOps.push_back(Flag);
01929 
01930   return DAG.getNode(X86ISD::RET_FLAG, dl,
01931                      MVT::Other, &RetOps[0], RetOps.size());
01932 }
01933 
01934 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
01935   if (N->getNumValues() != 1)
01936     return false;
01937   if (!N->hasNUsesOfValue(1, 0))
01938     return false;
01939 
01940   SDValue TCChain = Chain;
01941   SDNode *Copy = *N->use_begin();
01942   if (Copy->getOpcode() == ISD::CopyToReg) {
01943     // If the copy has a glue operand, we conservatively assume it isn't safe to
01944     // perform a tail call.
01945     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
01946       return false;
01947     TCChain = Copy->getOperand(0);
01948   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
01949     return false;
01950 
01951   bool HasRet = false;
01952   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
01953        UI != UE; ++UI) {
01954     if (UI->getOpcode() != X86ISD::RET_FLAG)
01955       return false;
01956     HasRet = true;
01957   }
01958 
01959   if (!HasRet)
01960     return false;
01961 
01962   Chain = TCChain;
01963   return true;
01964 }
01965 
01966 MVT
01967 X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
01968                                             ISD::NodeType ExtendKind) const {
01969   MVT ReturnMVT;
01970   // TODO: Is this also valid on 32-bit?
01971   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
01972     ReturnMVT = MVT::i8;
01973   else
01974     ReturnMVT = MVT::i32;
01975 
01976   MVT MinVT = getRegisterType(ReturnMVT);
01977   return VT.bitsLT(MinVT) ? MinVT : VT;
01978 }
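
// Concrete effect of the hook above on x86-64 (for reference): a zero-extended
// i1 return is widened only to i8, everything else smaller than i32 is widened
// to i32, and types at least as wide as i32 are kept as-is.
//
//   getTypeForExtArgOrReturn(MVT::i1,  ISD::ZERO_EXTEND) -> MVT::i8
//   getTypeForExtArgOrReturn(MVT::i8,  ISD::SIGN_EXTEND) -> MVT::i32
//   getTypeForExtArgOrReturn(MVT::i64, ISD::ZERO_EXTEND) -> MVT::i64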
01979 
01980 /// LowerCallResult - Lower the result values of a call into the
01981 /// appropriate copies out of appropriate physical registers.
01982 ///
01983 SDValue
01984 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
01985                                    CallingConv::ID CallConv, bool isVarArg,
01986                                    const SmallVectorImpl<ISD::InputArg> &Ins,
01987                                    SDLoc dl, SelectionDAG &DAG,
01988                                    SmallVectorImpl<SDValue> &InVals) const {
01989 
01990   // Assign locations to each value returned by this call.
01991   SmallVector<CCValAssign, 16> RVLocs;
01992   bool Is64Bit = Subtarget->is64Bit();
01993   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
01994                  getTargetMachine(), RVLocs, *DAG.getContext());
01995   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
01996 
01997   // Copy all of the result registers out of their specified physreg.
01998   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
01999     CCValAssign &VA = RVLocs[i];
02000     EVT CopyVT = VA.getValVT();
02001 
02002     // If this is x86-64, and we disabled SSE, we can't return FP values
02003     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02004         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02005       report_fatal_error("SSE register return with SSE disabled");
02006     }
02007 
02008     SDValue Val;
02009 
02010     // If this is a call to a function that returns an fp value on the floating
02011     // point stack, we must guarantee the value is popped from the stack, so
02012     // a CopyFromReg is not good enough - the copy instruction may be eliminated
02013     // if the return value is not used. We use the FpPOP_RETVAL instruction
02014     // instead.
02015     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
02016       // If we prefer to use the value in xmm registers, copy it out as f80 and
02017       // use a truncate to move it from fp stack reg to xmm reg.
02018       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
02019       SDValue Ops[] = { Chain, InFlag };
02020       Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
02021                                          MVT::Other, MVT::Glue, Ops), 1);
02022       Val = Chain.getValue(0);
02023 
02024       // Round the f80 to the right size, which also moves it to the appropriate
02025       // xmm register.
02026       if (CopyVT != VA.getValVT())
02027         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02028                           // This truncation won't change the value.
02029                           DAG.getIntPtrConstant(1));
02030     } else {
02031       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02032                                  CopyVT, InFlag).getValue(1);
02033       Val = Chain.getValue(0);
02034     }
02035     InFlag = Chain.getValue(2);
02036     InVals.push_back(Val);
02037   }
02038 
02039   return Chain;
02040 }
02041 
02042 //===----------------------------------------------------------------------===//
02043 //                C & StdCall & Fast Calling Convention implementation
02044 //===----------------------------------------------------------------------===//
02045 //  StdCall calling convention is standard for many Windows API routines. It
02046 //  differs from the C calling convention just a little: the callee, not the
02047 //  caller, cleans up the stack, and symbols are also decorated in a
02048 //  particular way. It doesn't support any vector arguments. For info on the
02049 //  fast calling convention see the Fast Calling Convention (tail call)
02050 //  implementation, LowerX86_32FastCCCallTo.
02051 
02052 /// CallIsStructReturn - Determines whether a call uses struct return
02053 /// semantics.
02054 enum StructReturnType {
02055   NotStructReturn,
02056   RegStructReturn,
02057   StackStructReturn
02058 };
02059 static StructReturnType
02060 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02061   if (Outs.empty())
02062     return NotStructReturn;
02063 
02064   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02065   if (!Flags.isSRet())
02066     return NotStructReturn;
02067   if (Flags.isInReg())
02068     return RegStructReturn;
02069   return StackStructReturn;
02070 }
02071 
02072 /// ArgsAreStructReturn - Determines whether a function uses struct
02073 /// return semantics.
02074 static StructReturnType
02075 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02076   if (Ins.empty())
02077     return NotStructReturn;
02078 
02079   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02080   if (!Flags.isSRet())
02081     return NotStructReturn;
02082   if (Flags.isInReg())
02083     return RegStructReturn;
02084   return StackStructReturn;
02085 }
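
// A minimal sketch (hypothetical helper) of how the classifiers above can be
// consumed: they distinguish calls with no sret argument from those whose
// hidden sret pointer travels in a register (inreg) or on the stack.
static bool returnsStructOnStack(const SmallVectorImpl<ISD::OutputArg> &Outs) {
  return callIsStructReturn(Outs) == StackStructReturn;
}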
02086 
02087 /// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
02088 /// specified by "Src" to the address "Dst", with size and alignment given by
02089 /// the specific parameter attribute. The copy will be passed as a byval
02090 /// function parameter.
02091 static SDValue
02092 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02093                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02094                           SDLoc dl) {
02095   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02096 
02097   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02098                        /*isVolatile*/false, /*AlwaysInline=*/true,
02099                        MachinePointerInfo(), MachinePointerInfo());
02100 }
02101 
02102 /// IsTailCallConvention - Return true if the calling convention is one that
02103 /// supports tail call optimization.
02104 static bool IsTailCallConvention(CallingConv::ID CC) {
02105   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02106           CC == CallingConv::HiPE);
02107 }
02108 
02109 /// \brief Return true if the calling convention is a C calling convention.
02110 static bool IsCCallConvention(CallingConv::ID CC) {
02111   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02112           CC == CallingConv::X86_64_SysV);
02113 }
02114 
02115 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02116   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02117     return false;
02118 
02119   CallSite CS(CI);
02120   CallingConv::ID CalleeCC = CS.getCallingConv();
02121   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02122     return false;
02123 
02124   return true;
02125 }
02126 
02127 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
02128 /// a tailcall target by changing its ABI.
02129 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02130                                    bool GuaranteedTailCallOpt) {
02131   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02132 }
02133 
02134 SDValue
02135 X86TargetLowering::LowerMemArgument(SDValue Chain,
02136                                     CallingConv::ID CallConv,
02137                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02138                                     SDLoc dl, SelectionDAG &DAG,
02139                                     const CCValAssign &VA,
02140                                     MachineFrameInfo *MFI,
02141                                     unsigned i) const {
02142   // Create the nodes corresponding to a load from this parameter slot.
02143   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02144   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
02145                               getTargetMachine().Options.GuaranteedTailCallOpt);
02146   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02147   EVT ValVT;
02148 
02149   // If the value is passed by pointer, we have the address passed instead of
02150   // the value itself.
02151   if (VA.getLocInfo() == CCValAssign::Indirect)
02152     ValVT = VA.getLocVT();
02153   else
02154     ValVT = VA.getValVT();
02155 
02156   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02157   // changed with more analysis.
02158   // In the case of tail call optimization, mark all arguments mutable, since
02159   // they could be overwritten by the lowering of arguments for a tail call.
02160   if (Flags.isByVal()) {
02161     unsigned Bytes = Flags.getByValSize();
02162     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02163     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02164     return DAG.getFrameIndex(FI, getPointerTy());
02165   } else {
02166     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02167                                     VA.getLocMemOffset(), isImmutable);
02168     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02169     return DAG.getLoad(ValVT, dl, Chain, FIN,
02170                        MachinePointerInfo::getFixedStack(FI),
02171                        false, false, false, 0);
02172   }
02173 }
02174 
02175 SDValue
02176 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02177                                         CallingConv::ID CallConv,
02178                                         bool isVarArg,
02179                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02180                                         SDLoc dl,
02181                                         SelectionDAG &DAG,
02182                                         SmallVectorImpl<SDValue> &InVals)
02183                                           const {
02184   MachineFunction &MF = DAG.getMachineFunction();
02185   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02186 
02187   const Function* Fn = MF.getFunction();
02188   if (Fn->hasExternalLinkage() &&
02189       Subtarget->isTargetCygMing() &&
02190       Fn->getName() == "main")
02191     FuncInfo->setForceFramePointer(true);
02192 
02193   MachineFrameInfo *MFI = MF.getFrameInfo();
02194   bool Is64Bit = Subtarget->is64Bit();
02195   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02196 
02197   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02198          "Var args not supported with calling convention fastcc, ghc or hipe");
02199 
02200   // Assign locations to all of the incoming arguments.
02201   SmallVector<CCValAssign, 16> ArgLocs;
02202   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
02203                  ArgLocs, *DAG.getContext());
02204 
02205   // Allocate shadow area for Win64
02206   if (IsWin64)
02207     CCInfo.AllocateStack(32, 8);
02208 
02209   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02210 
02211   unsigned LastVal = ~0U;
02212   SDValue ArgValue;
02213   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02214     CCValAssign &VA = ArgLocs[i];
02215     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02216     // places.
02217     assert(VA.getValNo() != LastVal &&
02218            "Don't support value assigned to multiple locs yet");
02219     (void)LastVal;
02220     LastVal = VA.getValNo();
02221 
02222     if (VA.isRegLoc()) {
02223       EVT RegVT = VA.getLocVT();
02224       const TargetRegisterClass *RC;
02225       if (RegVT == MVT::i32)
02226         RC = &X86::GR32RegClass;
02227       else if (Is64Bit && RegVT == MVT::i64)
02228         RC = &X86::GR64RegClass;
02229       else if (RegVT == MVT::f32)
02230         RC = &X86::FR32RegClass;
02231       else if (RegVT == MVT::f64)
02232         RC = &X86::FR64RegClass;
02233       else if (RegVT.is512BitVector())
02234         RC = &X86::VR512RegClass;
02235       else if (RegVT.is256BitVector())
02236         RC = &X86::VR256RegClass;
02237       else if (RegVT.is128BitVector())
02238         RC = &X86::VR128RegClass;
02239       else if (RegVT == MVT::x86mmx)
02240         RC = &X86::VR64RegClass;
02241       else if (RegVT == MVT::i1)
02242         RC = &X86::VK1RegClass;
02243       else if (RegVT == MVT::v8i1)
02244         RC = &X86::VK8RegClass;
02245       else if (RegVT == MVT::v16i1)
02246         RC = &X86::VK16RegClass;
02247       else
02248         llvm_unreachable("Unknown argument type!");
02249 
02250       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02251       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02252 
02253       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02254       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02255       // right size.
02256       if (VA.getLocInfo() == CCValAssign::SExt)
02257         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02258                                DAG.getValueType(VA.getValVT()));
02259       else if (VA.getLocInfo() == CCValAssign::ZExt)
02260         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02261                                DAG.getValueType(VA.getValVT()));
02262       else if (VA.getLocInfo() == CCValAssign::BCvt)
02263         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02264 
02265       if (VA.isExtInLoc()) {
02266         // Handle MMX values passed in XMM regs.
02267         if (RegVT.isVector())
02268           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02269         else
02270           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02271       }
02272     } else {
02273       assert(VA.isMemLoc());
02274       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02275     }
02276 
02277     // If the value is passed via a pointer, do a load.
02278     if (VA.getLocInfo() == CCValAssign::Indirect)
02279       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02280                              MachinePointerInfo(), false, false, false, 0);
02281 
02282     InVals.push_back(ArgValue);
02283   }
02284 
02285   // The x86-64 ABIs require that for returning structs by value we copy
02286   // the sret argument into %rax/%eax (depending on ABI) for the return.
02287   // Win32 requires us to place the sret argument in %eax as well.
02288   // Save the argument into a virtual register so that we can access it
02289   // from the return points.
02290   if (MF.getFunction()->hasStructRetAttr() &&
02291       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
02292     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02293     unsigned Reg = FuncInfo->getSRetReturnReg();
02294     if (!Reg) {
02295       MVT PtrTy = getPointerTy();
02296       Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02297       FuncInfo->setSRetReturnReg(Reg);
02298     }
02299     SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
02300     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02301   }
02302 
02303   unsigned StackSize = CCInfo.getNextStackOffset();
02304   // Align stack specially for tail calls.
02305   if (FuncIsMadeTailCallSafe(CallConv,
02306                              MF.getTarget().Options.GuaranteedTailCallOpt))
02307     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02308 
02309   // If the function takes variable number of arguments, make a frame index for
02310   // the start of the first vararg value... for expansion of llvm.va_start.
02311   if (isVarArg) {
02312     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02313                     CallConv != CallingConv::X86_ThisCall)) {
02314       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
02315     }
02316     if (Is64Bit) {
02317       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
02318 
02319       // FIXME: We should really autogenerate these arrays
02320       static const MCPhysReg GPR64ArgRegsWin64[] = {
02321         X86::RCX, X86::RDX, X86::R8,  X86::R9
02322       };
02323       static const MCPhysReg GPR64ArgRegs64Bit[] = {
02324         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02325       };
02326       static const MCPhysReg XMMArgRegs64Bit[] = {
02327         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02328         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02329       };
02330       const MCPhysReg *GPR64ArgRegs;
02331       unsigned NumXMMRegs = 0;
02332 
02333       if (IsWin64) {
02334         // The XMM registers which might contain vararg parameters are shadowed
02335         // by their paired GPRs, so we only need to save the GPRs to their home
02336         // slots.
02337         TotalNumIntRegs = 4;
02338         GPR64ArgRegs = GPR64ArgRegsWin64;
02339       } else {
02340         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
02341         GPR64ArgRegs = GPR64ArgRegs64Bit;
02342 
02343         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
02344                                                 TotalNumXMMRegs);
02345       }
02346       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
02347                                                        TotalNumIntRegs);
02348 
02349       bool NoImplicitFloatOps = Fn->getAttributes().
02350         hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
02351       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02352              "SSE register cannot be used when SSE is disabled!");
02353       assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
02354                NoImplicitFloatOps) &&
02355              "SSE register cannot be used when SSE is disabled!");
02356       if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02357           !Subtarget->hasSSE1())
02358         // Kernel mode asks for SSE to be disabled, so don't save the XMM
02359         // registers on the stack.
02360         TotalNumXMMRegs = 0;
02361 
02362       if (IsWin64) {
02363         const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
02364         // Get to the caller-allocated home save location.  Add 8 to account
02365         // for the return address.
02366         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02367         FuncInfo->setRegSaveFrameIndex(
02368           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02369         // Fixup to set vararg frame on shadow area (4 x i64).
02370         if (NumIntRegs < 4)
02371           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02372       } else {
02373         // For X86-64, if there are vararg parameters that are passed via
02374         // registers, then we must store them to their spots on the stack so
02375         // they may be loaded by dereferencing the result of va_next.
02376         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02377         FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
02378         FuncInfo->setRegSaveFrameIndex(
02379           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
02380                                false));
02381       }
02382 
02383       // Store the integer parameter registers.
02384       SmallVector<SDValue, 8> MemOps;
02385       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02386                                         getPointerTy());
02387       unsigned Offset = FuncInfo->getVarArgsGPOffset();
02388       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
02389         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02390                                   DAG.getIntPtrConstant(Offset));
02391         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
02392                                      &X86::GR64RegClass);
02393         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
02394         SDValue Store =
02395           DAG.getStore(Val.getValue(1), dl, Val, FIN,
02396                        MachinePointerInfo::getFixedStack(
02397                          FuncInfo->getRegSaveFrameIndex(), Offset),
02398                        false, false, 0);
02399         MemOps.push_back(Store);
02400         Offset += 8;
02401       }
02402 
02403       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
02404         // Now store the XMM (fp + vector) parameter registers.
02405         SmallVector<SDValue, 11> SaveXMMOps;
02406         SaveXMMOps.push_back(Chain);
02407 
02408         unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02409         SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
02410         SaveXMMOps.push_back(ALVal);
02411 
02412         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02413                                FuncInfo->getRegSaveFrameIndex()));
02414         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02415                                FuncInfo->getVarArgsFPOffset()));
02416 
02417         for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
02418           unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
02419                                        &X86::VR128RegClass);
02420           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
02421           SaveXMMOps.push_back(Val);
02422         }
02423         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02424                                      MVT::Other,
02425                                      &SaveXMMOps[0], SaveXMMOps.size()));
02426       }
02427 
02428       if (!MemOps.empty())
02429         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
02430                             &MemOps[0], MemOps.size());
02431     }
02432   }
02433 
02434   // Some CCs need callee pop.
02435   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02436                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02437     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02438   } else {
02439     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02440     // If this is an sret function, the return should pop the hidden pointer.
02441     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02442         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02443         argsAreStructReturn(Ins) == StackStructReturn)
02444       FuncInfo->setBytesToPopOnReturn(4);
02445   }
02446 
02447   if (!Is64Bit) {
02448     // RegSaveFrameIndex is X86-64 only.
02449     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02450     if (CallConv == CallingConv::X86_FastCall ||
02451         CallConv == CallingConv::X86_ThisCall)
02452       // fastcc functions can't have varargs.
02453       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02454   }
02455 
02456   FuncInfo->setArgumentStackSize(StackSize);
02457 
02458   return Chain;
02459 }
02460 
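// For illustration (SysV x86-64 with SSE enabled): the code above builds the
// va_start register save area as a single 16-byte-aligned stack object of
// 6*8 + 8*16 = 176 bytes:
//
//   [ RDI RSI RDX RCX R8 R9 ]   6 x 8  bytes, GP save area
//   [ XMM0 ...          XMM7 ]  8 x 16 bytes, FP/vector save area
//
// VarArgsGPOffset and VarArgsFPOffset then point at the first slots not
// consumed by named arguments, which is where va_arg starts reading.
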
02461 SDValue
02462 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02463                                     SDValue StackPtr, SDValue Arg,
02464                                     SDLoc dl, SelectionDAG &DAG,
02465                                     const CCValAssign &VA,
02466                                     ISD::ArgFlagsTy Flags) const {
02467   unsigned LocMemOffset = VA.getLocMemOffset();
02468   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02469   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02470   if (Flags.isByVal())
02471     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02472 
02473   return DAG.getStore(Chain, dl, Arg, PtrOff,
02474                       MachinePointerInfo::getStack(LocMemOffset),
02475                       false, false, 0);
02476 }
02477 
02478 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
02479 /// optimization is performed and it is required.
02480 SDValue
02481 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02482                                            SDValue &OutRetAddr, SDValue Chain,
02483                                            bool IsTailCall, bool Is64Bit,
02484                                            int FPDiff, SDLoc dl) const {
02485   // Adjust the Return address stack slot.
02486   EVT VT = getPointerTy();
02487   OutRetAddr = getReturnAddressFrameIndex(DAG);
02488 
02489   // Load the "old" Return address.
02490   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02491                            false, false, false, 0);
02492   return SDValue(OutRetAddr.getNode(), 1);
02493 }
02494 
02495 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
02496 /// optimization is performed and it is required (FPDiff!=0).
02497 static SDValue
02498 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
02499                          SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
02500                          unsigned SlotSize, int FPDiff, SDLoc dl) {
02501   // Store the return address to the appropriate stack slot.
02502   if (!FPDiff) return Chain;
02503   // Calculate the new stack slot for the return address.
02504   int NewReturnAddrFI =
02505     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02506                                          false);
02507   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02508   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02509                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02510                        false, false, 0);
02511   return Chain;
02512 }
02513 
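// Worked example (illustrative): with GuaranteedTailCallOpt, if the caller
// will pop 16 bytes of its own incoming arguments but the tail-called callee
// needs 32 bytes, FPDiff = 16 - 32 = -16.  With an 8-byte slot the return
// address is re-stored at fixed offset FPDiff - SlotSize = -24, i.e. 16 bytes
// below its original slot at -8, making room for the larger argument area.
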
02514 SDValue
02515 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02516                              SmallVectorImpl<SDValue> &InVals) const {
02517   SelectionDAG &DAG                     = CLI.DAG;
02518   SDLoc &dl                             = CLI.DL;
02519   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02520   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02521   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02522   SDValue Chain                         = CLI.Chain;
02523   SDValue Callee                        = CLI.Callee;
02524   CallingConv::ID CallConv              = CLI.CallConv;
02525   bool &isTailCall                      = CLI.IsTailCall;
02526   bool isVarArg                         = CLI.IsVarArg;
02527 
02528   MachineFunction &MF = DAG.getMachineFunction();
02529   bool Is64Bit        = Subtarget->is64Bit();
02530   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02531   StructReturnType SR = callIsStructReturn(Outs);
02532   bool IsSibcall      = false;
02533 
02534   if (MF.getTarget().Options.DisableTailCalls)
02535     isTailCall = false;
02536 
02537   if (isTailCall) {
02538     // Check if it's really possible to do a tail call.
02539     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02540                     isVarArg, SR != NotStructReturn,
02541                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02542                     Outs, OutVals, Ins, DAG);
02543 
02544     // Sibcalls are automatically detected tailcalls which do not require
02545     // ABI changes.
02546     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02547       IsSibcall = true;
02548 
02549     if (isTailCall)
02550       ++NumTailCalls;
02551   }
02552 
02553   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02554          "Var args not supported with calling convention fastcc, ghc or hipe");
02555 
02556   // Analyze operands of the call, assigning locations to each operand.
02557   SmallVector<CCValAssign, 16> ArgLocs;
02558   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
02559                  ArgLocs, *DAG.getContext());
02560 
02561   // Allocate shadow area for Win64
02562   if (IsWin64)
02563     CCInfo.AllocateStack(32, 8);
02564 
02565   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02566 
02567   // Get a count of how many bytes are to be pushed on the stack.
02568   unsigned NumBytes = CCInfo.getNextStackOffset();
02569   if (IsSibcall)
02570     // This is a sibcall. The outgoing memory operands already occupy the
02571     // caller's incoming argument area (in its own caller's stack frame).
02572     NumBytes = 0;
02573   else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
02574            IsTailCallConvention(CallConv))
02575     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02576 
02577   int FPDiff = 0;
02578   if (isTailCall && !IsSibcall) {
02579     // Lower arguments at fp - stackoffset + fpdiff.
02580     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02581     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02582 
02583     FPDiff = NumBytesCallerPushed - NumBytes;
02584 
02585     // Set the delta of movement of the returnaddr stackslot, but only if this
02586     // call requires a larger movement (a more negative FPDiff) than before.
02587     if (FPDiff < X86Info->getTCReturnAddrDelta())
02588       X86Info->setTCReturnAddrDelta(FPDiff);
02589   }
02590 
02591   unsigned NumBytesToPush = NumBytes;
02592   unsigned NumBytesToPop = NumBytes;
02593 
02594   // If we have an inalloca argument, all stack space has already been allocated
02595   // for us and sits right at the top of the stack.  We don't support multiple
02596   // arguments passed in memory when using inalloca.
02597   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02598     NumBytesToPush = 0;
02599     assert(ArgLocs.back().getLocMemOffset() == 0 &&
02600            "an inalloca argument must be the only memory argument");
02601   }
02602 
02603   if (!IsSibcall)
02604     Chain = DAG.getCALLSEQ_START(
02605         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02606 
02607   SDValue RetAddrFrIdx;
02608   // Load return address for tail calls.
02609   if (isTailCall && FPDiff)
02610     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02611                                     Is64Bit, FPDiff, dl);
02612 
02613   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02614   SmallVector<SDValue, 8> MemOpChains;
02615   SDValue StackPtr;
02616 
02617   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02618   // of tail call optimization, arguments are handled later.
02619   const X86RegisterInfo *RegInfo =
02620     static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
02621   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02622     // Skip inalloca arguments, they have already been written.
02623     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02624     if (Flags.isInAlloca())
02625       continue;
02626 
02627     CCValAssign &VA = ArgLocs[i];
02628     EVT RegVT = VA.getLocVT();
02629     SDValue Arg = OutVals[i];
02630     bool isByVal = Flags.isByVal();
02631 
02632     // Promote the value if needed.
02633     switch (VA.getLocInfo()) {
02634     default: llvm_unreachable("Unknown loc info!");
02635     case CCValAssign::Full: break;
02636     case CCValAssign::SExt:
02637       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02638       break;
02639     case CCValAssign::ZExt:
02640       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02641       break;
02642     case CCValAssign::AExt:
02643       if (RegVT.is128BitVector()) {
02644         // Special case: passing MMX values in XMM registers.
02645         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02646         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02647         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02648       } else
02649         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02650       break;
02651     case CCValAssign::BCvt:
02652       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02653       break;
02654     case CCValAssign::Indirect: {
02655       // Store the argument.
02656       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02657       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02658       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02659                            MachinePointerInfo::getFixedStack(FI),
02660                            false, false, 0);
02661       Arg = SpillSlot;
02662       break;
02663     }
02664     }
02665 
02666     if (VA.isRegLoc()) {
02667       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02668       if (isVarArg && IsWin64) {
02669         // The Win64 ABI requires an argument XMM register to be copied to the
02670         // corresponding shadow GPR if the callee is a varargs function.
02671         unsigned ShadowReg = 0;
02672         switch (VA.getLocReg()) {
02673         case X86::XMM0: ShadowReg = X86::RCX; break;
02674         case X86::XMM1: ShadowReg = X86::RDX; break;
02675         case X86::XMM2: ShadowReg = X86::R8; break;
02676         case X86::XMM3: ShadowReg = X86::R9; break;
02677         }
02678         if (ShadowReg)
02679           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02680       }
02681     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02682       assert(VA.isMemLoc());
02683       if (StackPtr.getNode() == 0)
02684         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02685                                       getPointerTy());
02686       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02687                                              dl, DAG, VA, Flags));
02688     }
02689   }
02690 
02691   if (!MemOpChains.empty())
02692     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
02693                         &MemOpChains[0], MemOpChains.size());
02694 
02695   if (Subtarget->isPICStyleGOT()) {
02696     // ELF / PIC requires the GOT pointer to be in the EBX register before
02697     // making function calls via the PLT.
02698     if (!isTailCall) {
02699       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02700                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02701     } else {
02702       // If we are tail calling and generating PIC/GOT style code load the
02703       // address of the callee into ECX. The value in ecx is used as target of
02704       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02705       // for tail calls on PIC/GOT architectures. Normally we would just put the
02706       // address of GOT into ebx and then call target@PLT. But for tail calls
02707       // ebx would be restored (since ebx is callee saved) before jumping to the
02708       // target@PLT.
02709 
02710       // Note: The actual moving to ECX is done further down.
02711       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02712       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02713           !G->getGlobal()->hasProtectedVisibility())
02714         Callee = LowerGlobalAddress(Callee, DAG);
02715       else if (isa<ExternalSymbolSDNode>(Callee))
02716         Callee = LowerExternalSymbol(Callee, DAG);
02717     }
02718   }
02719 
02720   if (Is64Bit && isVarArg && !IsWin64) {
02721     // From AMD64 ABI document:
02722     // For calls that may call functions that use varargs or stdargs
02723     // (prototype-less calls or calls to functions containing ellipsis (...) in
02724     // the declaration) %al is used as hidden argument to specify the number
02725     // of SSE registers used. The contents of %al do not need to match exactly
02726     // the number of registers, but must be an upper bound on the number of SSE
02727     // registers used and is in the range 0 - 8 inclusive.
02728 
02729     // Count the number of XMM registers allocated.
02730     static const MCPhysReg XMMArgRegs[] = {
02731       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02732       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02733     };
02734     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
02735     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02736            && "SSE registers cannot be used when SSE is disabled");
02737 
02738     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02739                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
02740   }
02741 
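  // For illustration: a call such as printf("%f %f", x, y) passes the two
  // doubles in XMM0 and XMM1, so NumXMMRegs is 2 and %al is set to 2 (any
  // upper bound up to 8 would also satisfy the ABI).
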
02742   // For tail calls lower the arguments to the 'real' stack slot.
02743   if (isTailCall) {
02744     // Force all the incoming stack arguments to be loaded from the stack
02745     // before any new outgoing arguments are stored to the stack, because the
02746     // outgoing stack slots may alias the incoming argument stack slots, and
02747     // the alias isn't otherwise explicit. This is slightly more conservative
02748     // than necessary, because it means that each store effectively depends
02749     // on every argument instead of just those arguments it would clobber.
02750     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02751 
02752     SmallVector<SDValue, 8> MemOpChains2;
02753     SDValue FIN;
02754     int FI = 0;
02755     if (getTargetMachine().Options.GuaranteedTailCallOpt) {
02756       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02757         CCValAssign &VA = ArgLocs[i];
02758         if (VA.isRegLoc())
02759           continue;
02760         assert(VA.isMemLoc());
02761         SDValue Arg = OutVals[i];
02762         ISD::ArgFlagsTy Flags = Outs[i].Flags;
02763         // Create frame index.
02764         int32_t Offset = VA.getLocMemOffset()+FPDiff;
02765         uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
02766         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
02767         FIN = DAG.getFrameIndex(FI, getPointerTy());
02768 
02769         if (Flags.isByVal()) {
02770           // Copy relative to framepointer.
02771           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
02772           if (StackPtr.getNode() == 0)
02773             StackPtr = DAG.getCopyFromReg(Chain, dl,
02774                                           RegInfo->getStackRegister(),
02775                                           getPointerTy());
02776           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
02777 
02778           MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
02779                                                            ArgChain,
02780                                                            Flags, DAG, dl));
02781         } else {
02782           // Store relative to framepointer.
02783           MemOpChains2.push_back(
02784             DAG.getStore(ArgChain, dl, Arg, FIN,
02785                          MachinePointerInfo::getFixedStack(FI),
02786                          false, false, 0));
02787         }
02788       }
02789     }
02790 
02791     if (!MemOpChains2.empty())
02792       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
02793                           &MemOpChains2[0], MemOpChains2.size());
02794 
02795     // Store the return address to the appropriate stack slot.
02796     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
02797                                      getPointerTy(), RegInfo->getSlotSize(),
02798                                      FPDiff, dl);
02799   }
02800 
02801   // Build a sequence of copy-to-reg nodes chained together with token chain
02802   // and flag operands which copy the outgoing args into registers.
02803   SDValue InFlag;
02804   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
02805     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
02806                              RegsToPass[i].second, InFlag);
02807     InFlag = Chain.getValue(1);
02808   }
02809 
02810   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
02811     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
02812     // In the 64-bit large code model, we have to make all calls
02813     // through a register, since the call instruction's 32-bit
02814     // pc-relative offset may not be large enough to hold the whole
02815     // address.
02816   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
02817     // If the callee is a GlobalAddress node (quite common, every direct call
02818     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
02819     // it.
02820 
02821     // We should use extra load for direct calls to dllimported functions in
02822     // non-JIT mode.
02823     const GlobalValue *GV = G->getGlobal();
02824     if (!GV->hasDLLImportStorageClass()) {
02825       unsigned char OpFlags = 0;
02826       bool ExtraLoad = false;
02827       unsigned WrapperKind = ISD::DELETED_NODE;
02828 
02829       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
02830       // external symbols must go through the PLT in PIC mode.  If the symbol
02831       // has hidden or protected visibility, or if it is static or local, then
02832       // we don't need to use the PLT - we can directly call it.
02833       if (Subtarget->isTargetELF() &&
02834           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
02835           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
02836         OpFlags = X86II::MO_PLT;
02837       } else if (Subtarget->isPICStyleStubAny() &&
02838                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
02839                  (!Subtarget->getTargetTriple().isMacOSX() ||
02840                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
02841         // PC-relative references to external symbols should go through $stub,
02842         // unless we're building with the leopard linker or later, which
02843         // automatically synthesizes these stubs.
02844         OpFlags = X86II::MO_DARWIN_STUB;
02845       } else if (Subtarget->isPICStyleRIPRel() &&
02846                  isa<Function>(GV) &&
02847                  cast<Function>(GV)->getAttributes().
02848                    hasAttribute(AttributeSet::FunctionIndex,
02849                                 Attribute::NonLazyBind)) {
02850         // If the function is marked as non-lazy, generate an indirect call
02851         // which loads from the GOT directly. This avoids runtime overhead
02852         // at the cost of eager binding (and one extra byte of encoding).
02853         OpFlags = X86II::MO_GOTPCREL;
02854         WrapperKind = X86ISD::WrapperRIP;
02855         ExtraLoad = true;
02856       }
02857 
02858       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
02859                                           G->getOffset(), OpFlags);
02860 
02861       // Add a wrapper if needed.
02862       if (WrapperKind != ISD::DELETED_NODE)
02863         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
02864       // Add extra indirection if needed.
02865       if (ExtraLoad)
02866         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
02867                              MachinePointerInfo::getGOT(),
02868                              false, false, false, 0);
02869     }
02870   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
02871     unsigned char OpFlags = 0;
02872 
02873     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
02874     // external symbols should go through the PLT.
02875     if (Subtarget->isTargetELF() &&
02876         getTargetMachine().getRelocationModel() == Reloc::PIC_) {
02877       OpFlags = X86II::MO_PLT;
02878     } else if (Subtarget->isPICStyleStubAny() &&
02879                (!Subtarget->getTargetTriple().isMacOSX() ||
02880                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
02881       // PC-relative references to external symbols should go through $stub,
02882       // unless we're building with the leopard linker or later, which
02883       // automatically synthesizes these stubs.
02884       OpFlags = X86II::MO_DARWIN_STUB;
02885     }
02886 
02887     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
02888                                          OpFlags);
02889   }
02890 
02891   // Returns a chain & a flag for retval copy to use.
02892   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
02893   SmallVector<SDValue, 8> Ops;
02894 
02895   if (!IsSibcall && isTailCall) {
02896     Chain = DAG.getCALLSEQ_END(Chain,
02897                                DAG.getIntPtrConstant(NumBytesToPop, true),
02898                                DAG.getIntPtrConstant(0, true), InFlag, dl);
02899     InFlag = Chain.getValue(1);
02900   }
02901 
02902   Ops.push_back(Chain);
02903   Ops.push_back(Callee);
02904 
02905   if (isTailCall)
02906     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
02907 
02908   // Add argument registers to the end of the list so that they are known live
02909   // into the call.
02910   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
02911     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
02912                                   RegsToPass[i].second.getValueType()));
02913 
02914   // Add a register mask operand representing the call-preserved registers.
02915   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
02916   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
02917   assert(Mask && "Missing call preserved mask for calling convention");
02918   Ops.push_back(DAG.getRegisterMask(Mask));
02919 
02920   if (InFlag.getNode())
02921     Ops.push_back(InFlag);
02922 
02923   if (isTailCall) {
02924     // We used to do:
02925     //// If this is the first return lowered for this function, add the regs
02926     //// to the liveout set for the function.
02927     // This isn't right, although it's probably harmless on x86; liveouts
02928     // should be computed from returns not tail calls.  Consider a void
02929     // function making a tail call to a function returning int.
02930     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
02931   }
02932 
02933   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
02934   InFlag = Chain.getValue(1);
02935 
02936   // Create the CALLSEQ_END node.
02937   unsigned NumBytesForCalleeToPop;
02938   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02939                        getTargetMachine().Options.GuaranteedTailCallOpt))
02940     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
02941   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02942            !Subtarget->getTargetTriple().isOSMSVCRT() &&
02943            SR == StackStructReturn)
02944     // If this is a call to a struct-return function, the callee
02945     // pops the hidden struct pointer, so we have to push it back.
02946     // This is common for Darwin/X86, Linux & Mingw32 targets.
02947     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
02948     NumBytesForCalleeToPop = 4;
02949   else
02950     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
02951 
02952   // Returns a flag for retval copy to use.
02953   if (!IsSibcall) {
02954     Chain = DAG.getCALLSEQ_END(Chain,
02955                                DAG.getIntPtrConstant(NumBytesToPop, true),
02956                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
02957                                                      true),
02958                                InFlag, dl);
02959     InFlag = Chain.getValue(1);
02960   }
02961 
02962   // Handle result values, copying them out of physregs into vregs that we
02963   // return.
02964   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
02965                          Ins, dl, DAG, InVals);
02966 }
02967 
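// Worked example (illustrative): for a 32-bit call to a function that returns
// a struct via a hidden sret pointer on a non-MSVCRT target, the callee pops
// that pointer with "ret 4", so NumBytesForCalleeToPop above is 4 and the
// CALLSEQ_END node records 4 bytes as already popped by the callee.
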
02968 //===----------------------------------------------------------------------===//
02969 //                Fast Calling Convention (tail call) implementation
02970 //===----------------------------------------------------------------------===//
02971 
02972 //  Like stdcall, the callee cleans up the arguments, except that ECX is
02973 //  reserved for storing the tail-called function's address. Only 2 registers
02974 //  are free for argument passing (inreg). Tail call optimization is performed
02975 //  provided:
02976 //                * tailcallopt is enabled
02977 //                * caller/callee are fastcc
02978 //  On X86_64 architecture with GOT-style position independent code only local
02979 //  (within module) calls are supported at the moment.
02980 //  To keep the stack aligned according to the platform ABI, the function
02981 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
02982 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
02983 //  for example.) If the tail-called callee has more arguments than the caller,
02984 //  the caller needs to make sure that there is room to move the RETADDR to.
02985 //  This is achieved by reserving an area the size of the argument delta right
02986 //  after the original RETADDR, but before the saved frame pointer or the
02987 //  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
02988 //  stack layout:
02989 //    arg1
02990 //    arg2
02991 //    RETADDR
02992 //    [ new RETADDR
02993 //      move area ]
02994 //    (possible EBP)
02995 //    ESI
02996 //    EDI
02997 //    local1 ..
02998 
02999 /// GetAlignedArgumentStackSize - Align the argument stack size, e.g. to
03000 /// 16n + 12 bytes for a 16-byte alignment requirement.
03001 unsigned
03002 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03003                                                SelectionDAG& DAG) const {
03004   MachineFunction &MF = DAG.getMachineFunction();
03005   const TargetMachine &TM = MF.getTarget();
03006   const X86RegisterInfo *RegInfo =
03007     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
03008   const TargetFrameLowering &TFI = *TM.getFrameLowering();
03009   unsigned StackAlignment = TFI.getStackAlignment();
03010   uint64_t AlignMask = StackAlignment - 1;
03011   int64_t Offset = StackSize;
03012   unsigned SlotSize = RegInfo->getSlotSize();
03013   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03014     // Misalignment is at most (StackAlignment - SlotSize), so just add the difference.
03015     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03016   } else {
03017     // Mask out the lower bits, then add one full stack alignment plus (StackAlignment - SlotSize).
03018     Offset = ((~AlignMask) & Offset) + StackAlignment +
03019       (StackAlignment-SlotSize);
03020   }
03021   return Offset;
03022 }
03023 
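// Worked example (illustrative), with StackAlignment = 16 and SlotSize = 4
// (32-bit), so results are of the form 16n + 12:
//   StackSize = 20: 20 & 15 = 4  <= 12, so Offset = 20 + (12 - 4)        = 28.
//   StackSize = 30: 30 & 15 = 14 >  12, so Offset = (30 & ~15) + 16 + 12 = 44.
// In both cases pushing the 4-byte return address restores 16-byte alignment.
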
03024 /// MatchingStackOffset - Return true if the given stack call argument is
03025 /// already available in the same position (relatively) of the caller's
03026 /// incoming argument stack.
03027 static
03028 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03029                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03030                          const X86InstrInfo *TII) {
03031   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03032   int FI = INT_MAX;
03033   if (Arg.getOpcode() == ISD::CopyFromReg) {
03034     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03035     if (!TargetRegisterInfo::isVirtualRegister(VR))
03036       return false;
03037     MachineInstr *Def = MRI->getVRegDef(VR);
03038     if (!Def)
03039       return false;
03040     if (!Flags.isByVal()) {
03041       if (!TII->isLoadFromStackSlot(Def, FI))
03042         return false;
03043     } else {
03044       unsigned Opcode = Def->getOpcode();
03045       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
03046           Def->getOperand(1).isFI()) {
03047         FI = Def->getOperand(1).getIndex();
03048         Bytes = Flags.getByValSize();
03049       } else
03050         return false;
03051     }
03052   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03053     if (Flags.isByVal())
03054       // ByVal argument is passed in as a pointer but it's now being
03055       // dereferenced. e.g.
03056       // define @foo(%struct.X* %A) {
03057       //   tail call @bar(%struct.X* byval %A)
03058       // }
03059       return false;
03060     SDValue Ptr = Ld->getBasePtr();
03061     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03062     if (!FINode)
03063       return false;
03064     FI = FINode->getIndex();
03065   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03066     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03067     FI = FINode->getIndex();
03068     Bytes = Flags.getByValSize();
03069   } else
03070     return false;
03071 
03072   assert(FI != INT_MAX);
03073   if (!MFI->isFixedObjectIndex(FI))
03074     return false;
03075   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03076 }
03077 
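// For illustration, a pattern this check accepts (hypothetical IR):
//
//   define i32 @caller(i32 %a, i32 %b) {
//     %r = tail call i32 @callee(i32 %a, i32 %b)
//     ret i32 %r
//   }
//
// On 32-bit targets %a and %b arrive on the stack; when they are forwarded
// unchanged, each outgoing value is a load from the caller's own fixed stack
// object with the same offset and size, so MatchingStackOffset returns true.
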
03078 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03079 /// for tail call optimization. Targets which want to do tail call
03080 /// optimization should implement this function.
03081 bool
03082 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03083                                                      CallingConv::ID CalleeCC,
03084                                                      bool isVarArg,
03085                                                      bool isCalleeStructRet,
03086                                                      bool isCallerStructRet,
03087                                                      Type *RetTy,
03088                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03089                                     const SmallVectorImpl<SDValue> &OutVals,
03090                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03091                                                      SelectionDAG &DAG) const {
03092   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03093     return false;
03094 
03095   // If -tailcallopt is specified, make fastcc functions tail-callable.
03096   const MachineFunction &MF = DAG.getMachineFunction();
03097   const Function *CallerF = MF.getFunction();
03098 
03099   // If the function return type is x86_fp80 and the callee return type is not,
03100   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03101   // perform a tailcall optimization here.
03102   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03103     return false;
03104 
03105   CallingConv::ID CallerCC = CallerF->getCallingConv();
03106   bool CCMatch = CallerCC == CalleeCC;
03107   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03108   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03109 
03110   if (getTargetMachine().Options.GuaranteedTailCallOpt) {
03111     if (IsTailCallConvention(CalleeCC) && CCMatch)
03112       return true;
03113     return false;
03114   }
03115 
03116   // Look for obvious safe cases to perform tail call optimization that do not
03117   // require ABI changes. This is what gcc calls sibcall.
03118 
03119   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03120   // emit a special epilogue.
03121   const X86RegisterInfo *RegInfo =
03122     static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
03123   if (RegInfo->needsStackRealignment(MF))
03124     return false;
03125 
03126   // Also avoid sibcall optimization if either caller or callee uses struct
03127   // return semantics.
03128   if (isCalleeStructRet || isCallerStructRet)
03129     return false;
03130 
03131   // An stdcall/thiscall caller is expected to clean up its arguments; the
03132   // callee isn't going to do that.
03133   // FIXME: this is more restrictive than needed. We could produce a tailcall
03134   // when the stack adjustment matches. For example, with a thiscall that takes
03135   // only one argument.
03136   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03137                    CallerCC == CallingConv::X86_ThisCall))
03138     return false;
03139 
03140   // Do not sibcall optimize vararg calls unless all arguments are passed via
03141   // registers.
03142   if (isVarArg && !Outs.empty()) {
03143 
03144     // Optimizing for varargs on Win64 is unlikely to be safe without
03145     // additional testing.
03146     if (IsCalleeWin64 || IsCallerWin64)
03147       return false;
03148 
03149     SmallVector<CCValAssign, 16> ArgLocs;
03150     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
03151                    getTargetMachine(), ArgLocs, *DAG.getContext());
03152 
03153     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03154     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03155       if (!ArgLocs[i].isRegLoc())
03156         return false;
03157   }
03158 
03159   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03160   // stack.  Therefore, if the result is not used by the caller, it is not safe
03161   // to optimize this into a sibcall.
03162   bool Unused = false;
03163   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03164     if (!Ins[i].Used) {
03165       Unused = true;
03166       break;
03167     }
03168   }
03169   if (Unused) {
03170     SmallVector<CCValAssign, 16> RVLocs;
03171     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
03172                    getTargetMachine(), RVLocs, *DAG.getContext());
03173     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03174     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03175       CCValAssign &VA = RVLocs[i];
03176       if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
03177         return false;
03178     }
03179   }
03180 
03181   // If the calling conventions do not match, then we'd better make sure the
03182   // results are returned in the same way as what the caller expects.
03183   if (!CCMatch) {
03184     SmallVector<CCValAssign, 16> RVLocs1;
03185     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
03186                     getTargetMachine(), RVLocs1, *DAG.getContext());
03187     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03188 
03189     SmallVector<CCValAssign, 16> RVLocs2;
03190     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
03191                     getTargetMachine(), RVLocs2, *DAG.getContext());
03192     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03193 
03194     if (RVLocs1.size() != RVLocs2.size())
03195       return false;
03196     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03197       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03198         return false;
03199       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03200         return false;
03201       if (RVLocs1[i].isRegLoc()) {
03202         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03203           return false;
03204       } else {
03205         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03206           return false;
03207       }
03208     }
03209   }
03210 
03211   // If the callee takes no arguments then go on to check the results of the
03212   // call.
03213   if (!Outs.empty()) {
03214     // Check if stack adjustment is needed. For now, do not do this if any
03215     // argument is passed on the stack.
03216     SmallVector<CCValAssign, 16> ArgLocs;
03217     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
03218                    getTargetMachine(), ArgLocs, *DAG.getContext());
03219 
03220     // Allocate shadow area for Win64
03221     if (IsCalleeWin64)
03222       CCInfo.AllocateStack(32, 8);
03223 
03224     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03225     if (CCInfo.getNextStackOffset()) {
03226       MachineFunction &MF = DAG.getMachineFunction();
03227       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03228         return false;
03229 
03230       // Check if the arguments are already laid out in the right way as
03231       // the caller's fixed stack objects.
03232       MachineFrameInfo *MFI = MF.getFrameInfo();
03233       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03234       const X86InstrInfo *TII =
03235         ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
03236       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03237         CCValAssign &VA = ArgLocs[i];
03238         SDValue Arg = OutVals[i];
03239         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03240         if (VA.getLocInfo() == CCValAssign::Indirect)
03241           return false;
03242         if (!VA.isRegLoc()) {
03243           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03244                                    MFI, MRI, TII))
03245             return false;
03246         }
03247       }
03248     }
03249 
03250     // If the tailcall address may be in a register, then make sure it's
03251     // possible to register allocate for it. In 32-bit, the call address can
03252     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03253     // callee-saved registers are restored. These happen to be the same
03254     // registers used to pass 'inreg' arguments so watch out for those.
03255     if (!Subtarget->is64Bit() &&
03256         ((!isa<GlobalAddressSDNode>(Callee) &&
03257           !isa<ExternalSymbolSDNode>(Callee)) ||
03258          getTargetMachine().getRelocationModel() == Reloc::PIC_)) {
03259       unsigned NumInRegs = 0;
03260       // In PIC we need an extra register to formulate the address computation
03261       // for the callee.
03262       unsigned MaxInRegs =
03263           (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03264 
03265       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03266         CCValAssign &VA = ArgLocs[i];
03267         if (!VA.isRegLoc())
03268           continue;
03269         unsigned Reg = VA.getLocReg();
03270         switch (Reg) {
03271         default: break;
03272         case X86::EAX: case X86::EDX: case X86::ECX:
03273           if (++NumInRegs == MaxInRegs)
03274             return false;
03275           break;
03276         }
03277       }
03278     }
03279   }
03280 
03281   return true;
03282 }
03283 
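// For illustration: without -tailcallopt, the checks in this function accept a
// sibcall such as
//
//   define void @caller(i32 %x) {
//     tail call void @callee(i32 %x)      ; same C calling convention
//     ret void
//   }
//
// but reject it if, for example, the caller needs dynamic stack realignment,
// either side uses sret, or a vararg argument would have to be passed on the
// stack.
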
03284 FastISel *
03285 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03286                                   const TargetLibraryInfo *libInfo) const {
03287   return X86::createFastISel(funcInfo, libInfo);
03288 }
03289 
03290 //===----------------------------------------------------------------------===//
03291 //                           Other Lowering Hooks
03292 //===----------------------------------------------------------------------===//
03293 
03294 static bool MayFoldLoad(SDValue Op) {
03295   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03296 }
03297 
03298 static bool MayFoldIntoStore(SDValue Op) {
03299   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03300 }
03301 
03302 static bool isTargetShuffle(unsigned Opcode) {
03303   switch(Opcode) {
03304   default: return false;
03305   case X86ISD::PSHUFD:
03306   case X86ISD::PSHUFHW:
03307   case X86ISD::PSHUFLW:
03308   case X86ISD::SHUFP:
03309   case X86ISD::PALIGNR:
03310   case X86ISD::MOVLHPS:
03311   case X86ISD::MOVLHPD:
03312   case X86ISD::MOVHLPS:
03313   case X86ISD::MOVLPS:
03314   case X86ISD::MOVLPD:
03315   case X86ISD::MOVSHDUP:
03316   case X86ISD::MOVSLDUP:
03317   case X86ISD::MOVDDUP:
03318   case X86ISD::MOVSS:
03319   case X86ISD::MOVSD:
03320   case X86ISD::UNPCKL:
03321   case X86ISD::UNPCKH:
03322   case X86ISD::VPERMILP:
03323   case X86ISD::VPERM2X128:
03324   case X86ISD::VPERMI:
03325     return true;
03326   }
03327 }
03328 
03329 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03330                                     SDValue V1, SelectionDAG &DAG) {
03331   switch(Opc) {
03332   default: llvm_unreachable("Unknown x86 shuffle node");
03333   case X86ISD::MOVSHDUP:
03334   case X86ISD::MOVSLDUP:
03335   case X86ISD::MOVDDUP:
03336     return DAG.getNode(Opc, dl, VT, V1);
03337   }
03338 }
03339 
03340 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03341                                     SDValue V1, unsigned TargetMask,
03342                                     SelectionDAG &DAG) {
03343   switch(Opc) {
03344   default: llvm_unreachable("Unknown x86 shuffle node");
03345   case X86ISD::PSHUFD:
03346   case X86ISD::PSHUFHW:
03347   case X86ISD::PSHUFLW:
03348   case X86ISD::VPERMILP:
03349   case X86ISD::VPERMI:
03350     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03351   }
03352 }
03353 
03354 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03355                                     SDValue V1, SDValue V2, unsigned TargetMask,
03356                                     SelectionDAG &DAG) {
03357   switch(Opc) {
03358   default: llvm_unreachable("Unknown x86 shuffle node");
03359   case X86ISD::PALIGNR:
03360   case X86ISD::SHUFP:
03361   case X86ISD::VPERM2X128:
03362     return DAG.getNode(Opc, dl, VT, V1, V2,
03363                        DAG.getConstant(TargetMask, MVT::i8));
03364   }
03365 }
03366 
03367 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03368                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03369   switch(Opc) {
03370   default: llvm_unreachable("Unknown x86 shuffle node");
03371   case X86ISD::MOVLHPS:
03372   case X86ISD::MOVLHPD:
03373   case X86ISD::MOVHLPS:
03374   case X86ISD::MOVLPS:
03375   case X86ISD::MOVLPD:
03376   case X86ISD::MOVSS:
03377   case X86ISD::MOVSD:
03378   case X86ISD::UNPCKL:
03379   case X86ISD::UNPCKH:
03380     return DAG.getNode(Opc, dl, VT, V1, V2);
03381   }
03382 }
03383 
03384 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03385   MachineFunction &MF = DAG.getMachineFunction();
03386   const X86RegisterInfo *RegInfo =
03387     static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
03388   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03389   int ReturnAddrIndex = FuncInfo->getRAIndex();
03390 
03391   if (ReturnAddrIndex == 0) {
03392     // Set up a frame object for the return address.
03393     unsigned SlotSize = RegInfo->getSlotSize();
03394     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03395                                                            -(int64_t)SlotSize,
03396                                                            false);
03397     FuncInfo->setRAIndex(ReturnAddrIndex);
03398   }
03399 
03400   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03401 }
03402 
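// For illustration: on x86-64 SlotSize is 8, so the fixed object created above
// sits 8 bytes below the first incoming stack argument, i.e. the slot where
// the CALL instruction stored the return address.
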
03403 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03404                                        bool hasSymbolicDisplacement) {
03405   // Offset should fit into 32 bit immediate field.
03406   if (!isInt<32>(Offset))
03407     return false;
03408 
03409   // If we don't have a symbolic displacement - we don't have any extra
03410   // restrictions.
03411   if (!hasSymbolicDisplacement)
03412     return true;
03413 
03414   // FIXME: Some tweaks might be needed for medium code model.
03415   if (M != CodeModel::Small && M != CodeModel::Kernel)
03416     return false;
03417 
03418   // For the small code model we assume that the last object ends 16MB before
03419   // the 31-bit boundary. We may also accept pretty large negative constants,
03420   // knowing that all objects are in the positive half of the address space.
03421   if (M == CodeModel::Small && Offset < 16*1024*1024)
03422     return true;
03423 
03424   // For the kernel code model we know that all objects reside in the negative
03425   // half of the 32-bit address space, so we must not accept negative offsets
03426   // (they may fall just outside that range), but large positive ones are fine.
03427   if (M == CodeModel::Kernel && Offset > 0)
03428     return true;
03429 
03430   return false;
03431 }
03432 
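// Worked examples (illustrative), assuming a symbolic displacement is present:
//   Small  code model: Offset =  8*1024*1024 (8MB)  -> accepted (< 16MB).
//   Small  code model: Offset = 64*1024*1024 (64MB) -> rejected.
//   Kernel code model: Offset =  4096               -> accepted (positive).
//   Kernel code model: Offset = -16                 -> rejected (negative).
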
03433 /// isCalleePop - Determines whether the callee is required to pop its
03434 /// own arguments. Callee pop is necessary to support tail calls.
03435 bool X86::isCalleePop(CallingConv::ID CallingConv,
03436                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03437   if (IsVarArg)
03438     return false;
03439 
03440   switch (CallingConv) {
03441   default:
03442     return false;
03443   case CallingConv::X86_StdCall:
03444     return !is64Bit;
03445   case CallingConv::X86_FastCall:
03446     return !is64Bit;
03447   case CallingConv::X86_ThisCall:
03448     return !is64Bit;
03449   case CallingConv::Fast:
03450     return TailCallOpt;
03451   case CallingConv::GHC:
03452     return TailCallOpt;
03453   case CallingConv::HiPE:
03454     return TailCallOpt;
03455   }
03456 }
03457 
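// For illustration: a 32-bit stdcall/fastcall/thiscall callee pops its own
// arguments (e.g. "ret 8" for 8 bytes of arguments), so this returns true for
// those; fastcc/GHC/HiPE callees pop only when -tailcallopt is enabled; and
// vararg callees never pop, since the callee cannot know the argument size.
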
03458 /// \brief Return true if the condition is an unsigned comparison operation.
03459 static bool isX86CCUnsigned(unsigned X86CC) {
03460   switch (X86CC) {
03461   default: llvm_unreachable("Invalid integer condition!");
03462   case X86::COND_E:     return true;
03463   case X86::COND_G:     return false;
03464   case X86::COND_GE:    return false;
03465   case X86::COND_L:     return false;
03466   case X86::COND_LE:    return false;
03467   case X86::COND_NE:    return true;
03468   case X86::COND_B:     return true;
03469   case X86::COND_A:     return true;
03470   case X86::COND_BE:    return true;
03471   case X86::COND_AE:    return true;
03472   }
03473   llvm_unreachable("covered switch fell through?!");
03474 }
03475 
03476 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
03477 /// X86-specific condition code, returning the condition code and the LHS/RHS
03478 /// of the comparison to make.
03479 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03480                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03481   if (!isFP) {
03482     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03483       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03484         // X > -1   -> X == 0, jump !sign.
03485         RHS = DAG.getConstant(0, RHS.getValueType());
03486         return X86::COND_NS;
03487       }
03488       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03489         // X < 0   -> X == 0, jump on sign.
03490         return X86::COND_S;
03491       }
03492       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03493         // X < 1   -> X <= 0
03494         RHS = DAG.getConstant(0, RHS.getValueType());
03495         return X86::COND_LE;
03496       }
03497     }
03498 
03499     switch (SetCCOpcode) {
03500     default: llvm_unreachable("Invalid integer condition!");
03501     case ISD::SETEQ:  return X86::COND_E;
03502     case ISD::SETGT:  return X86::COND_G;
03503     case ISD::SETGE:  return X86::COND_GE;
03504     case ISD::SETLT:  return X86::COND_L;
03505     case ISD::SETLE:  return X86::COND_LE;
03506     case ISD::SETNE:  return X86::COND_NE;
03507     case ISD::SETULT: return X86::COND_B;
03508     case ISD::SETUGT: return X86::COND_A;
03509     case ISD::SETULE: return X86::COND_BE;
03510     case ISD::SETUGE: return X86::COND_AE;
03511     }
03512   }
03513 
03514   // First determine if it is required or is profitable to flip the operands.
03515 
03516   // If LHS is a foldable load, but RHS is not, flip the condition.
03517   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03518       !ISD::isNON_EXTLoad(RHS.getNode())) {
03519     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03520     std::swap(LHS, RHS);
03521   }
03522 
03523   switch (SetCCOpcode) {
03524   default: break;
03525   case ISD::SETOLT:
03526   case ISD::SETOLE:
03527   case ISD::SETUGT:
03528   case ISD::SETUGE:
03529     std::swap(LHS, RHS);
03530     break;
03531   }
03532 
03533   // On a floating point condition, the flags are set as follows:
03534   // ZF  PF  CF   op
03535   //  0 | 0 | 0 | X > Y
03536   //  0 | 0 | 1 | X < Y
03537   //  1 | 0 | 0 | X == Y
03538   //  1 | 1 | 1 | unordered
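  // For example, ISD::SETOLT had its operands swapped above, so it is handled
  // as an ordered greater-than and selects X86::COND_A; ISD::SETOEQ has no
  // single flag combination here and returns X86::COND_INVALID.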
03539   switch (SetCCOpcode) {
03540   default: llvm_unreachable("Condcode should be pre-legalized away");
03541   case ISD::SETUEQ:
03542   case ISD::SETEQ:   return X86::COND_E;
03543   case ISD::SETOLT:              // flipped
03544   case ISD::SETOGT:
03545   case ISD::SETGT:   return X86::COND_A;
03546   case ISD::SETOLE:              // flipped
03547   case ISD::SETOGE:
03548   case ISD::SETGE:   return X86::COND_AE;
03549   case ISD::SETUGT:              // flipped
03550   case ISD::SETULT:
03551   case ISD::SETLT:   return X86::COND_B;
03552   case ISD::SETUGE:              // flipped
03553   case ISD::SETULE:
03554   case ISD::SETLE:   return X86::COND_BE;
03555   case ISD::SETONE:
03556   case ISD::SETNE:   return X86::COND_NE;
03557   case ISD::SETUO:   return X86::COND_P;
03558   case ISD::SETO:    return X86::COND_NP;
03559   case ISD::SETOEQ:
03560   case ISD::SETUNE:  return X86::COND_INVALID;
03561   }
03562 }
03563 
03564 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
03565 /// code? The current x86 ISA includes the following FP cmov instructions:
03566 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03567 static bool hasFPCMov(unsigned X86CC) {
03568   switch (X86CC) {
03569   default:
03570     return false;
03571   case X86::COND_B:
03572   case X86::COND_BE:
03573   case X86::COND_E:
03574   case X86::COND_P:
03575   case X86::COND_A:
03576   case X86::COND_AE:
03577   case X86::COND_NE:
03578   case X86::COND_NP:
03579     return true;
03580   }
03581 }
03582 
03583 /// isFPImmLegal - Returns true if the target can instruction select the
03584 /// specified FP immediate natively. If false, the legalizer will
03585 /// materialize the FP immediate as a load from a constant pool.
03586 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03587   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03588     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03589       return true;
03590   }
03591   return false;
03592 }
03593 
03594 /// \brief Returns true if it is beneficial to convert a load of a constant
03595 /// to just the constant itself.
03596 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03597                                                           Type *Ty) const {
03598   assert(Ty->isIntegerTy());
03599 
03600   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03601   if (BitSize == 0 || BitSize > 64)
03602     return false;
03603   return true;
03604 }
03605 
03606 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03607 /// the specified half-open range [Low, Hi).
03608 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03609   return (Val < 0) || (Val >= Low && Val < Hi);
03610 }
03611 
03612 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03613 /// specified value.
03614 static bool isUndefOrEqual(int Val, int CmpVal) {
03615   return (Val < 0 || Val == CmpVal);
03616 }
03617 
03618 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03619 /// at position Pos and ending before Pos+Size, is undef or falls within the
03620 /// specified sequential range [Low, Low+Size).
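/// For example, Mask = <4, -1, 6, 7> with Pos = 0, Size = 4, Low = 4 returns
/// true.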
03621 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03622                                        unsigned Pos, unsigned Size, int Low) {
03623   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03624     if (!isUndefOrEqual(Mask[i], Low))
03625       return false;
03626   return true;
03627 }
03628 
03629 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
03630 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
03631 /// the second operand.
03632 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
03633   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
03634     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
03635   if (VT == MVT::v2f64 || VT == MVT::v2i64)
03636     return (Mask[0] < 2 && Mask[1] < 2);
03637   return false;
03638 }
03639 
03640 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
03641 /// is suitable for input to PSHUFHW.
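/// For example, for v8i16 the mask <0, 1, 2, 3, 7, 6, 5, 4> qualifies: the low
/// quadword is copied in order and the high quadword is permuted within itself.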
03642 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03643   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03644     return false;
03645 
03646   // Lower quadword copied in order or undef.
03647   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
03648     return false;
03649 
03650   // Upper quadword shuffled.
03651   for (unsigned i = 4; i != 8; ++i)
03652     if (!isUndefOrInRange(Mask[i], 4, 8))
03653       return false;
03654 
03655   if (VT == MVT::v16i16) {
03656     // Lower quadword copied in order or undef.
03657     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
03658       return false;
03659 
03660     // Upper quadword shuffled.
03661     for (unsigned i = 12; i != 16; ++i)
03662       if (!isUndefOrInRange(Mask[i], 12, 16))
03663         return false;
03664   }
03665 
03666   return true;
03667 }
03668 
03669 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
03670 /// is suitable for input to PSHUFLW.
03671 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03672   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03673     return false;
03674 
03675   // Upper quadword copied in order.
03676   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
03677     return false;
03678 
03679   // Lower quadword shuffled.
03680   for (unsigned i = 0; i != 4; ++i)
03681     if (!isUndefOrInRange(Mask[i], 0, 4))
03682       return false;
03683 
03684   if (VT == MVT::v16i16) {
03685     // Upper quadword copied in order.
03686     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
03687       return false;
03688 
03689     // Lower quadword shuffled.
03690     for (unsigned i = 8; i != 12; ++i)
03691       if (!isUndefOrInRange(Mask[i], 8, 12))
03692         return false;
03693   }
03694 
03695   return true;
03696 }
03697 
03698 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
03699 /// is suitable for input to PALIGNR.
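/// For example, for v8i16 the mask <1, 2, 3, 4, 5, 6, 7, 8> qualifies: it
/// selects eight consecutive elements, starting at index 1, from the two
/// sources laid out as a single 16-element sequence.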
03700 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
03701                           const X86Subtarget *Subtarget) {
03702   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
03703       (VT.is256BitVector() && !Subtarget->hasInt256()))
03704     return false;
03705 
03706   unsigned NumElts = VT.getVectorNumElements();
03707   unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
03708   unsigned NumLaneElts = NumElts/NumLanes;
03709 
03710   // Do not handle 64-bit element shuffles with palignr.
03711   if (NumLaneElts == 2)
03712     return false;
03713 
03714   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
03715     unsigned i;
03716     for (i = 0; i != NumLaneElts; ++i) {
03717       if (Mask[i+l] >= 0)
03718         break;
03719     }
03720 
03721     // Lane is all undef, go to next lane
03722     if (i == NumLaneElts)
03723       continue;
03724 
03725     int Start = Mask[i+l];
03726 
03727     // Make sure it's in this lane in one of the sources
03728     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
03729         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
03730       return false;
03731 
03732     // If not lane 0, then we must match lane 0
03733     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
03734       return false;
03735 
03736     // Correct second source to be contiguous with first source
03737     if (Start >= (int)NumElts)
03738       Start -= NumElts - NumLaneElts;
03739 
03740     // Make sure we're shifting in the right direction.
03741     if (Start <= (int)(i+l))
03742       return false;
03743 
03744     Start -= i;
03745 
03746     // Check the rest of the elements to see if they are consecutive.
03747     for (++i; i != NumLaneElts; ++i) {
03748       int Idx = Mask[i+l];
03749 
03750       // Make sure it's in this lane
03751       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
03752           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
03753         return false;
03754 
03755       // If not lane 0, then we must match lane 0
03756       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
03757         return false;
03758 
03759       if (Idx >= (int)NumElts)
03760         Idx -= NumElts - NumLaneElts;
03761 
03762       if (!isUndefOrEqual(Idx, Start+i))
03763         return false;
03764 
03765     }
03766   }
03767 
03768   return true;
03769 }
03770 
03771 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
03772 /// the two vector operands have swapped position.
03773 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
03774                                      unsigned NumElems) {
03775   for (unsigned i = 0; i != NumElems; ++i) {
03776     int idx = Mask[i];
03777     if (idx < 0)
03778       continue;
03779     else if (idx < (int)NumElems)
03780       Mask[i] = idx + NumElems;
03781     else
03782       Mask[i] = idx - NumElems;
03783   }
03784 }
03785 
03786 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
03787 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
03788 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
03789 /// reverse of what x86 shuffles want.
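/// For example, for v4f32 with Commuted == false the mask <0, 1, 4, 5> is
/// accepted: the low half selects from the first source and the high half from
/// the second.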
03790 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
03791 
03792   unsigned NumElems = VT.getVectorNumElements();
03793   unsigned NumLanes = VT.getSizeInBits()/128;
03794   unsigned NumLaneElems = NumElems/NumLanes;
03795 
03796   if (NumLaneElems != 2 && NumLaneElems != 4)
03797     return false;
03798 
03799   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
03800   bool symetricMaskRequired =
03801     (VT.getSizeInBits() >= 256) && (EltSize == 32);
03802 
03803   // VSHUFPSY divides the resulting vector into 4 chunks.
03804   // The sources are also split into 4 chunks, and each destination
03805   // chunk must come from a different source chunk.
03806   //
03807   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
03808   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
03809   //
03810   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
03811   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
03812   //
03813   // VSHUFPDY divides the resulting vector into 4 chunks.
03814   // The sources are also split into 4 chunks, and each destination
03815   // chunk must come from a different source chunk.
03816   //
03817   //  SRC1 =>      X3       X2       X1       X0
03818   //  SRC2 =>      Y3       Y2       Y1       Y0
03819   //
03820   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
03821   //
03822   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
03823   unsigned HalfLaneElems = NumLaneElems/2;
03824   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
03825     for (unsigned i = 0; i != NumLaneElems; ++i) {
03826       int Idx = Mask[i+l];
03827       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
03828       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
03829         return false;
03830       // For VSHUFPSY, the mask of the second half must be the same as the
03831       // first but with the appropriate offsets. This works in the same way as
03832       // VPERMILPS works with masks.
03833       if (!symetricMaskRequired || Idx < 0)
03834         continue;
03835       if (MaskVal[i] < 0) {
03836         MaskVal[i] = Idx - l;
03837         continue;
03838       }
03839       if ((signed)(Idx - l) != MaskVal[i])
03840         return false;
03841     }
03842   }
03843 
03844   return true;
03845 }
03846 
03847 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
03848 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
03849 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
03850   if (!VT.is128BitVector())
03851     return false;
03852 
03853   unsigned NumElems = VT.getVectorNumElements();
03854 
03855   if (NumElems != 4)
03856     return false;
03857 
03858   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
03859   return isUndefOrEqual(Mask[0], 6) &&
03860          isUndefOrEqual(Mask[1], 7) &&
03861          isUndefOrEqual(Mask[2], 2) &&
03862          isUndefOrEqual(Mask[3], 3);
03863 }
03864 
03865 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
03866 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
03867 /// <2, 3, 2, 3>
03868 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
03869   if (!VT.is128BitVector())
03870     return false;
03871 
03872   unsigned NumElems = VT.getVectorNumElements();
03873 
03874   if (NumElems != 4)
03875     return false;
03876 
03877   return isUndefOrEqual(Mask[0], 2) &&
03878          isUndefOrEqual(Mask[1], 3) &&
03879          isUndefOrEqual(Mask[2], 2) &&
03880          isUndefOrEqual(Mask[3], 3);
03881 }
03882 
03883 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
03884 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
03885 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
03886   if (!VT.is128BitVector())
03887     return false;
03888 
03889   unsigned NumElems = VT.getVectorNumElements();
03890 
03891   if (NumElems != 2 && NumElems != 4)
03892     return false;
03893 
03894   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
03895     if (!isUndefOrEqual(Mask[i], i + NumElems))
03896       return false;
03897 
03898   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
03899     if (!isUndefOrEqual(Mask[i], i))
03900       return false;
03901 
03902   return true;
03903 }
03904 
03905 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
03906 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
03907 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
03908   if (!VT.is128BitVector())
03909     return false;
03910 
03911   unsigned NumElems = VT.getVectorNumElements();
03912 
03913   if (NumElems != 2 && NumElems != 4)
03914     return false;
03915 
03916   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
03917     if (!isUndefOrEqual(Mask[i], i))
03918       return false;
03919 
03920   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
03921     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
03922       return false;
03923 
03924   return true;
03925 }
03926 
03927 //
03928 // Some special combinations that can be optimized.
03929 //
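// For example, the "even" mask <0, 8, 2, 10, 4, 12, 6, 14> is rewritten below
// as a shift of the second operand followed by a blend with the fixed mask
// <0, 9, 2, 11, 4, 13, 6, 15>.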
03930 static
03931 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
03932                                SelectionDAG &DAG) {
03933   MVT VT = SVOp->getSimpleValueType(0);
03934   SDLoc dl(SVOp);
03935 
03936   if (VT != MVT::v8i32 && VT != MVT::v8f32)
03937     return SDValue();
03938 
03939   ArrayRef<int> Mask = SVOp->getMask();
03940 
03941   // These are the special masks that may be optimized.
03942   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
03943   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
03944   bool MatchEvenMask = true;
03945   bool MatchOddMask  = true;
03946   for (int i=0; i<8; ++i) {
03947     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
03948       MatchEvenMask = false;
03949     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
03950       MatchOddMask = false;
03951   }
03952 
03953   if (!MatchEvenMask && !MatchOddMask)
03954     return SDValue();
03955 
03956   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
03957 
03958   SDValue Op0 = SVOp->getOperand(0);
03959   SDValue Op1 = SVOp->getOperand(1);
03960 
03961   if (MatchEvenMask) {
03962     // Shift the second operand right to 32 bits.
03963     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
03964     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
03965   } else {
03966     // Shift the first operand left to 32 bits.
03967     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
03968     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
03969   }
03970   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
03971   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
03972 }
03973 
03974 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
03975 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
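/// For example, for v4i32 the interleaving mask <0, 4, 1, 5> is accepted.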
03976 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
03977                          bool HasInt256, bool V2IsSplat = false) {
03978 
03979   assert(VT.getSizeInBits() >= 128 &&
03980          "Unsupported vector type for unpckl");
03981 
03982   // AVX defines UNPCK* to operate independently on 128-bit lanes.
03983   unsigned NumLanes;
03984   unsigned NumOf256BitLanes;
03985   unsigned NumElts = VT.getVectorNumElements();
03986   if (VT.is256BitVector()) {
03987     if (NumElts != 4 && NumElts != 8 &&
03988         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
03989     return false;
03990     NumLanes = 2;
03991     NumOf256BitLanes = 1;
03992   } else if (VT.is512BitVector()) {
03993     assert(VT.getScalarType().getSizeInBits() >= 32 &&
03994            "Unsupported vector type for unpckh");
03995     NumLanes = 2;
03996     NumOf256BitLanes = 2;
03997   } else {
03998     NumLanes = 1;
03999     NumOf256BitLanes = 1;
04000   }
04001 
04002   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
04003   unsigned NumLaneElts = NumEltsInStride/NumLanes;
04004 
04005   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
04006     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
04007       for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04008         int BitI  = Mask[l256*NumEltsInStride+l+i];
04009         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
04010         if (!isUndefOrEqual(BitI, j+l256*NumElts))
04011           return false;
04012         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
04013           return false;
04014         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
04015           return false;
04016       }
04017     }
04018   }
04019   return true;
04020 }
04021 
04022 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
04023 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
04024 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
04025                          bool HasInt256, bool V2IsSplat = false) {
04026   assert(VT.getSizeInBits() >= 128 &&
04027          "Unsupported vector type for unpckh");
04028 
04029   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04030   unsigned NumLanes;
04031   unsigned NumOf256BitLanes;
04032   unsigned NumElts = VT.getVectorNumElements();
04033   if (VT.is256BitVector()) {
04034     if (NumElts != 4 && NumElts != 8 &&
04035         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04036     return false;
04037     NumLanes = 2;
04038     NumOf256BitLanes = 1;
04039   } else if (VT.is512BitVector()) {
04040     assert(VT.getScalarType().getSizeInBits() >= 32 &&
04041            "Unsupported vector type for unpckh");
04042     NumLanes = 2;
04043     NumOf256BitLanes = 2;
04044   } else {
04045     NumLanes = 1;
04046     NumOf256BitLanes = 1;
04047   }
04048 
04049   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
04050   unsigned NumLaneElts = NumEltsInStride/NumLanes;
04051 
04052   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
04053     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
04054       for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04055         int BitI  = Mask[l256*NumEltsInStride+l+i];
04056         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
04057         if (!isUndefOrEqual(BitI, j+l256*NumElts))
04058           return false;
04059         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
04060           return false;
04061         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
04062           return false;
04063       }
04064     }
04065   }
04066   return true;
04067 }
04068 
04069 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
04070 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
04071 /// <0, 0, 1, 1>
04072 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04073   unsigned NumElts = VT.getVectorNumElements();
04074   bool Is256BitVec = VT.is256BitVector();
04075 
04076   if (VT.is512BitVector())
04077     return false;
04078   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04079          "Unsupported vector type for unpckh");
04080 
04081   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
04082       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04083     return false;
04084 
04085   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
04086   // FIXME: Need a better way to get rid of this, there's no latency difference
04087   // between UNPCKLPD and MOVDDUP; the latter should always be checked first and
04088   // the former later. We should also remove the "_undef" special mask.
04089   if (NumElts == 4 && Is256BitVec)
04090     return false;
04091 
04092   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04093   // independently on 128-bit lanes.
04094   unsigned NumLanes = VT.getSizeInBits()/128;
04095   unsigned NumLaneElts = NumElts/NumLanes;
04096 
04097   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04098     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04099       int BitI  = Mask[l+i];
04100       int BitI1 = Mask[l+i+1];
04101 
04102       if (!isUndefOrEqual(BitI, j))
04103         return false;
04104       if (!isUndefOrEqual(BitI1, j))
04105         return false;
04106     }
04107   }
04108 
04109   return true;
04110 }
04111 
04112 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
04113 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
04114 /// <2, 2, 3, 3>
04115 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04116   unsigned NumElts = VT.getVectorNumElements();
04117 
04118   if (VT.is512BitVector())
04119     return false;
04120 
04121   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04122          "Unsupported vector type for unpckh");
04123 
04124   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04125       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04126     return false;
04127 
04128   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04129   // independently on 128-bit lanes.
04130   unsigned NumLanes = VT.getSizeInBits()/128;
04131   unsigned NumLaneElts = NumElts/NumLanes;
04132 
04133   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04134     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04135       int BitI  = Mask[l+i];
04136       int BitI1 = Mask[l+i+1];
04137       if (!isUndefOrEqual(BitI, j))
04138         return false;
04139       if (!isUndefOrEqual(BitI1, j))
04140         return false;
04141     }
04142   }
04143   return true;
04144 }
04145 
04146 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
04147 /// specifies a shuffle of elements that is suitable for input to MOVSS,
04148 /// MOVSD, and MOVD, i.e. setting the lowest element.
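/// For example, for v4i32 the mask <4, 1, 2, 3> is accepted: the low element
/// comes from the second operand and the rest from the first, in order.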
04149 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
04150   if (VT.getVectorElementType().getSizeInBits() < 32)
04151     return false;
04152   if (!VT.is128BitVector())
04153     return false;
04154 
04155   unsigned NumElts = VT.getVectorNumElements();
04156 
04157   if (!isUndefOrEqual(Mask[0], NumElts))
04158     return false;
04159 
04160   for (unsigned i = 1; i != NumElts; ++i)
04161     if (!isUndefOrEqual(Mask[i], i))
04162       return false;
04163 
04164   return true;
04165 }
04166 
04167 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
04168 /// as permutations between 128-bit chunks or halves. As an example, in the
04169 /// shuffle below:
04170 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
04171 /// the first half comes from the second half of V1 and the second half from
04172 /// the second half of V2.
04173 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04174   if (!HasFp256 || !VT.is256BitVector())
04175     return false;
04176 
04177   // The shuffle result is divided into half A and half B. In total the two
04178   // sources have 4 halves, namely: C, D, E, F. The final values of A and
04179   // B must come from C, D, E or F.
04180   unsigned HalfSize = VT.getVectorNumElements()/2;
04181   bool MatchA = false, MatchB = false;
04182 
04183   // Check if A comes from one of C, D, E, F.
04184   for (unsigned Half = 0; Half != 4; ++Half) {
04185     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
04186       MatchA = true;
04187       break;
04188     }
04189   }
04190 
04191   // Check if B comes from one of C, D, E, F.
04192   for (unsigned Half = 0; Half != 4; ++Half) {
04193     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
04194       MatchB = true;
04195       break;
04196     }
04197   }
04198 
04199   return MatchA && MatchB;
04200 }
04201 
04202 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
04203 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
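/// For example, for v8i32 the mask <4, 5, 6, 7, 12, 13, 14, 15> yields
/// FstHalf = 1 and SndHalf = 3, i.e. the immediate 0x31.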
04204 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
04205   MVT VT = SVOp->getSimpleValueType(0);
04206 
04207   unsigned HalfSize = VT.getVectorNumElements()/2;
04208 
04209   unsigned FstHalf = 0, SndHalf = 0;
04210   for (unsigned i = 0; i < HalfSize; ++i) {
04211     if (SVOp->getMaskElt(i) > 0) {
04212       FstHalf = SVOp->getMaskElt(i)/HalfSize;
04213       break;
04214     }
04215   }
04216   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
04217     if (SVOp->getMaskElt(i) > 0) {
04218       SndHalf = SVOp->getMaskElt(i)/HalfSize;
04219       break;
04220     }
04221   }
04222 
04223   return (FstHalf | (SndHalf << 4));
04224 }
04225 
04226 // Symmetric in-lane mask. Each lane has 4 elements (for imm8).
04227 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
04228   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04229   if (EltSize < 32)
04230     return false;
04231 
04232   unsigned NumElts = VT.getVectorNumElements();
04233   Imm8 = 0;
04234   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
04235     for (unsigned i = 0; i != NumElts; ++i) {
04236       if (Mask[i] < 0)
04237         continue;
04238       Imm8 |= Mask[i] << (i*2);
04239     }
04240     return true;
04241   }
04242 
04243   unsigned LaneSize = 4;
04244   SmallVector<int, 4> MaskVal(LaneSize, -1);
04245 
04246   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04247     for (unsigned i = 0; i != LaneSize; ++i) {
04248       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04249         return false;
04250       if (Mask[i+l] < 0)
04251         continue;
04252       if (MaskVal[i] < 0) {
04253         MaskVal[i] = Mask[i+l] - l;
04254         Imm8 |= MaskVal[i] << (i*2);
04255         continue;
04256       }
04257       if (Mask[i+l] != (signed)(MaskVal[i]+l))
04258         return false;
04259     }
04260   }
04261   return true;
04262 }
04263 
04264 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
04265 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
04266 /// Note that VPERMIL mask matching differs depending on whether the underlying
04267 /// element type is 32 or 64 bits. For VPERMILPS the high half of the mask must
04268 /// select the same relative elements as the low half, but from the upper half of the source.
04269 /// In VPERMILPD the two lanes could be shuffled independently of each other
04270 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
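/// For example, for v8f32 the mask <1, 0, 3, 2, 5, 4, 7, 6> is accepted: both
/// 128-bit lanes apply the same in-lane permutation <1, 0, 3, 2>.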
04271 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
04272   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04273   if (VT.getSizeInBits() < 256 || EltSize < 32)
04274     return false;
04275   bool symetricMaskRequired = (EltSize == 32);
04276   unsigned NumElts = VT.getVectorNumElements();
04277 
04278   unsigned NumLanes = VT.getSizeInBits()/128;
04279   unsigned LaneSize = NumElts/NumLanes;
04280   // 2 or 4 elements in one lane
04281 
04282   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
04283   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04284     for (unsigned i = 0; i != LaneSize; ++i) {
04285       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04286         return false;
04287       if (symetricMaskRequired) {
04288         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
04289           ExpectedMaskVal[i] = Mask[i+l] - l;
04290           continue;
04291         }
04292         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
04293           return false;
04294       }
04295     }
04296   }
04297   return true;
04298 }
04299 
04300 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
04301 /// x86 MOVSS wants: the lowest element must be the lowest element of vector 2,
04302 /// and the other elements must come from vector 1 in order.
04303 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
04304                                bool V2IsSplat = false, bool V2IsUndef = false) {
04305   if (!VT.is128BitVector())
04306     return false;
04307 
04308   unsigned NumOps = VT.getVectorNumElements();
04309   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
04310     return false;
04311 
04312   if (!isUndefOrEqual(Mask[0], 0))
04313     return false;
04314 
04315   for (unsigned i = 1; i != NumOps; ++i)
04316     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
04317           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
04318           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
04319       return false;
04320 
04321   return true;
04322 }
04323 
04324 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04325 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
04326 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
04327 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
04328                            const X86Subtarget *Subtarget) {
04329   if (!Subtarget->hasSSE3())
04330     return false;
04331 
04332   unsigned NumElems = VT.getVectorNumElements();
04333 
04334   if ((VT.is128BitVector() && NumElems != 4) ||
04335       (VT.is256BitVector() && NumElems != 8) ||
04336       (VT.is512BitVector() && NumElems != 16))
04337     return false;
04338 
04339   // "i+1" is the value the indexed mask element must have
04340   for (unsigned i = 0; i != NumElems; i += 2)
04341     if (!isUndefOrEqual(Mask[i], i+1) ||
04342         !isUndefOrEqual(Mask[i+1], i+1))
04343       return false;
04344 
04345   return true;
04346 }
04347 
04348 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04349 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
04350 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
04351 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
04352                            const X86Subtarget *Subtarget) {
04353   if (!Subtarget->hasSSE3())
04354     return false;
04355 
04356   unsigned NumElems = VT.getVectorNumElements();
04357 
04358   if ((VT.is128BitVector() && NumElems != 4) ||
04359       (VT.is256BitVector() && NumElems != 8) ||
04360       (VT.is512BitVector() && NumElems != 16))
04361     return false;
04362 
04363   // "i" is the value the indexed mask element must have
04364   for (unsigned i = 0; i != NumElems; i += 2)
04365     if (!isUndefOrEqual(Mask[i], i) ||
04366         !isUndefOrEqual(Mask[i+1], i))
04367       return false;
04368 
04369   return true;
04370 }
04371 
04372 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
04373 /// specifies a shuffle of elements that is suitable for input to 256-bit
04374 /// version of MOVDDUP.
04375 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04376   if (!HasFp256 || !VT.is256BitVector())
04377     return false;
04378 
04379   unsigned NumElts = VT.getVectorNumElements();
04380   if (NumElts != 4)
04381     return false;
04382 
04383   for (unsigned i = 0; i != NumElts/2; ++i)
04384     if (!isUndefOrEqual(Mask[i], 0))
04385       return false;
04386   for (unsigned i = NumElts/2; i != NumElts; ++i)
04387     if (!isUndefOrEqual(Mask[i], NumElts/2))
04388       return false;
04389   return true;
04390 }
04391 
04392 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04393 /// specifies a shuffle of elements that is suitable for input to 128-bit
04394 /// version of MOVDDUP.
04395 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
04396   if (!VT.is128BitVector())
04397     return false;
04398 
04399   unsigned e = VT.getVectorNumElements() / 2;
04400   for (unsigned i = 0; i != e; ++i)
04401     if (!isUndefOrEqual(Mask[i], i))
04402       return false;
04403   for (unsigned i = 0; i != e; ++i)
04404     if (!isUndefOrEqual(Mask[e+i], i))
04405       return false;
04406   return true;
04407 }
04408 
04409 /// isVEXTRACTIndex - Return true if the specified
04410 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
04411 /// suitable for instructions that extract 128- or 256-bit vectors
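/// For example, extracting a 128-bit v4i32 subvector at element index 4 is
/// accepted (4 * 32 bits is 128-bit aligned), while index 2 is not.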
04412 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
04413   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04414   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04415     return false;
04416 
04417   // The index should be aligned on a vecWidth-bit boundary.
04418   uint64_t Index =
04419     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04420 
04421   MVT VT = N->getSimpleValueType(0);
04422   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04423   bool Result = (Index * ElSize) % vecWidth == 0;
04424 
04425   return Result;
04426 }
04427 
04428 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
04429 /// operand specifies a subvector insert that is suitable for input to
04430 /// insertion of 128 or 256-bit subvectors
04431 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
04432   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04433   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04434     return false;
04435   // The index should be aligned on a vecWidth-bit boundary.
04436   uint64_t Index =
04437     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04438 
04439   MVT VT = N->getSimpleValueType(0);
04440   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04441   bool Result = (Index * ElSize) % vecWidth == 0;
04442 
04443   return Result;
04444 }
04445 
04446 bool X86::isVINSERT128Index(SDNode *N) {
04447   return isVINSERTIndex(N, 128);
04448 }
04449 
04450 bool X86::isVINSERT256Index(SDNode *N) {
04451   return isVINSERTIndex(N, 256);
04452 }
04453 
04454 bool X86::isVEXTRACT128Index(SDNode *N) {
04455   return isVEXTRACTIndex(N, 128);
04456 }
04457 
04458 bool X86::isVEXTRACT256Index(SDNode *N) {
04459   return isVEXTRACTIndex(N, 256);
04460 }
04461 
04462 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
04463 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
04464 /// Handles 128-bit and 256-bit.
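/// For example, for v4i32 the mask <3, 2, 1, 0> produces the immediate 0x1B
/// (0b00011011).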
04465 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
04466   MVT VT = N->getSimpleValueType(0);
04467 
04468   assert((VT.getSizeInBits() >= 128) &&
04469          "Unsupported vector type for PSHUF/SHUFP");
04470 
04471   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
04472   // independently on 128-bit lanes.
04473   unsigned NumElts = VT.getVectorNumElements();
04474   unsigned NumLanes = VT.getSizeInBits()/128;
04475   unsigned NumLaneElts = NumElts/NumLanes;
04476 
04477   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
04478          "Only supports 2, 4 or 8 elements per lane");
04479 
04480   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
04481   unsigned Mask = 0;
04482   for (unsigned i = 0; i != NumElts; ++i) {
04483     int Elt = N->getMaskElt(i);
04484     if (Elt < 0) continue;
04485     Elt &= NumLaneElts - 1;
04486     unsigned ShAmt = (i << Shift) % 8;
04487     Mask |= Elt << ShAmt;
04488   }
04489 
04490   return Mask;
04491 }
04492 
04493 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
04494 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
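/// For example, for v8i16 the mask <0, 1, 2, 3, 7, 6, 5, 4> produces the
/// immediate 0x1B: only the upper four elements contribute, two bits each.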
04495 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
04496   MVT VT = N->getSimpleValueType(0);
04497 
04498   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04499          "Unsupported vector type for PSHUFHW");
04500 
04501   unsigned NumElts = VT.getVectorNumElements();
04502 
04503   unsigned Mask = 0;
04504   for (unsigned l = 0; l != NumElts; l += 8) {
04505     // 8 nodes per lane, but we only care about the last 4.
04506     for (unsigned i = 0; i < 4; ++i) {
04507       int Elt = N->getMaskElt(l+i+4);
04508       if (Elt < 0) continue;
04509       Elt &= 0x3; // only 2-bits.
04510       Mask |= Elt << (i * 2);
04511     }
04512   }
04513 
04514   return Mask;
04515 }
04516 
04517 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
04518 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
04519 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
04520   MVT VT = N->getSimpleValueType(0);
04521 
04522   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04523          "Unsupported vector type for PSHUFHW");
04524 
04525   unsigned NumElts = VT.getVectorNumElements();
04526 
04527   unsigned Mask = 0;
04528   for (unsigned l = 0; l != NumElts; l += 8) {
04529     // 8 nodes per lane, but we only care about the first 4.
04530     for (unsigned i = 0; i < 4; ++i) {
04531       int Elt = N->getMaskElt(l+i);
04532       if (Elt < 0) continue;
04533       Elt &= 0x3; // only 2-bits
04534       Mask |= Elt << (i * 2);
04535     }
04536   }
04537 
04538   return Mask;
04539 }
04540 
04541 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
04542 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
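/// For example, for a v8i16 mask <1, 2, 3, 4, 5, 6, 7, 8> the first defined
/// element is 1 at position 0, giving a byte immediate of (1 - 0) * 2 = 2.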
04543 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
04544   MVT VT = SVOp->getSimpleValueType(0);
04545   unsigned EltSize = VT.is512BitVector() ? 1 :
04546     VT.getVectorElementType().getSizeInBits() >> 3;
04547 
04548   unsigned NumElts = VT.getVectorNumElements();
04549   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
04550   unsigned NumLaneElts = NumElts/NumLanes;
04551 
04552   int Val = 0;
04553   unsigned i;
04554   for (i = 0; i != NumElts; ++i) {
04555     Val = SVOp->getMaskElt(i);
04556     if (Val >= 0)
04557       break;
04558   }
04559   if (Val >= (int)NumElts)
04560     Val -= NumElts - NumLaneElts;
04561 
04562   assert(Val - i > 0 && "PALIGNR imm should be positive");
04563   return (Val - i) * EltSize;
04564 }
04565 
04566 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
04567   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04568   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04569     llvm_unreachable("Illegal extract subvector for VEXTRACT");
04570 
04571   uint64_t Index =
04572     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04573 
04574   MVT VecVT = N->getOperand(0).getSimpleValueType();
04575   MVT ElVT = VecVT.getVectorElementType();
04576 
04577   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04578   return Index / NumElemsPerChunk;
04579 }
04580 
04581 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
04582   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04583   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04584     llvm_unreachable("Illegal insert subvector for VINSERT");
04585 
04586   uint64_t Index =
04587     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04588 
04589   MVT VecVT = N->getSimpleValueType(0);
04590   MVT ElVT = VecVT.getVectorElementType();
04591 
04592   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04593   return Index / NumElemsPerChunk;
04594 }
04595 
04596 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
04597 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
04598 /// and VEXTRACTI128 instructions.
04599 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
04600   return getExtractVEXTRACTImmediate(N, 128);
04601 }
04602 
04603 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
04604 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
04605 /// and VEXTRACTI64x4 instructions.
04606 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
04607   return getExtractVEXTRACTImmediate(N, 256);
04608 }
04609 
04610 /// getInsertVINSERT128Immediate - Return the appropriate immediate
04611 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
04612 /// and VINSERTI128 instructions.
04613 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
04614   return getInsertVINSERTImmediate(N, 128);
04615 }
04616 
04617 /// getInsertVINSERT256Immediate - Return the appropriate immediate
04618 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
04619 /// and VINSERTI64x4 instructions.
04620 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
04621   return getInsertVINSERTImmediate(N, 256);
04622 }
04623 
04624 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
04625 /// constant +0.0.
04626 bool X86::isZeroNode(SDValue Elt) {
04627   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
04628     return CN->isNullValue();
04629   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
04630     return CFP->getValueAPF().isPosZero();
04631   return false;
04632 }
04633 
04634 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
04635 /// their permute mask.
04636 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
04637                                     SelectionDAG &DAG) {
04638   MVT VT = SVOp->getSimpleValueType(0);
04639   unsigned NumElems = VT.getVectorNumElements();
04640   SmallVector<int, 8> MaskVec;
04641 
04642   for (unsigned i = 0; i != NumElems; ++i) {
04643     int Idx = SVOp->getMaskElt(i);
04644     if (Idx >= 0) {
04645       if (Idx < (int)NumElems)
04646         Idx += NumElems;
04647       else
04648         Idx -= NumElems;
04649     }
04650     MaskVec.push_back(Idx);
04651   }
04652   return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
04653                               SVOp->getOperand(0), &MaskVec[0]);
04654 }
04655 
04656 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
04657 /// match movhlps. The lower half elements should come from the upper half of
04658 /// V1 (and in order), and the upper half elements should come from the upper
04659 /// half of V2 (and in order).
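/// In other words, the mask must be <2, 3, 6, 7>, allowing undef elements.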
04660 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
04661   if (!VT.is128BitVector())
04662     return false;
04663   if (VT.getVectorNumElements() != 4)
04664     return false;
04665   for (unsigned i = 0, e = 2; i != e; ++i)
04666     if (!isUndefOrEqual(Mask[i], i+2))
04667       return false;
04668   for (unsigned i = 2; i != 4; ++i)
04669     if (!isUndefOrEqual(Mask[i], i+4))
04670       return false;
04671   return true;
04672 }
04673 
04674 /// isScalarLoadToVector - Returns true if the node is a scalar load that
04675 /// is promoted to a vector. It also returns the LoadSDNode by reference if
04676 /// required.
04677 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
04678   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
04679     return false;
04680   N = N->getOperand(0).getNode();
04681   if (!ISD::isNON_EXTLoad(N))
04682     return false;
04683   if (LD)
04684     *LD = cast<LoadSDNode>(N);
04685   return true;
04686 }
04687 
04688 // Test whether the given value is a vector value which will be legalized
04689 // into a load.
04690 static bool WillBeConstantPoolLoad(SDNode *N) {
04691   if (N->getOpcode() != ISD::BUILD_VECTOR)
04692     return false;
04693 
04694   // Check for any non-constant elements.
04695   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
04696     switch (N->getOperand(i).getNode()->getOpcode()) {
04697     case ISD::UNDEF:
04698     case ISD::ConstantFP:
04699     case ISD::Constant:
04700       break;
04701     default:
04702       return false;
04703     }
04704 
04705   // Vectors of all-zeros and all-ones are materialized with special
04706   // instructions rather than being loaded.
04707   return !ISD::isBuildVectorAllZeros(N) &&
04708          !ISD::isBuildVectorAllOnes(N);
04709 }
04710 
04711 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
04712 /// match movlp{s|d}. The lower half elements should come from the lower half of
04713 /// V1 (and in order), and the upper half elements should come from the upper
04714 /// half of V2 (and in order). And since V1 will become the source of the
04715 /// MOVLP, it must be either a vector load or a scalar load to vector.
04716 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
04717                                ArrayRef<int> Mask, MVT VT) {
04718   if (!VT.is128BitVector())
04719     return false;
04720 
04721   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
04722     return false;
04723   // If V2 is a vector load, don't do this transformation. We will try to use
04724   // a load-folding shufps op instead.
04725   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
04726     return false;
04727 
04728   unsigned NumElems = VT.getVectorNumElements();
04729 
04730   if (NumElems != 2 && NumElems != 4)
04731     return false;
04732   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04733     if (!isUndefOrEqual(Mask[i], i))
04734       return false;
04735   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
04736     if (!isUndefOrEqual(Mask[i], i+NumElems))
04737       return false;
04738   return true;
04739 }
04740 
04741 /// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
04742 /// all the same.
04743 static bool isSplatVector(SDNode *N) {
04744   if (N->getOpcode() != ISD::BUILD_VECTOR)
04745     return false;
04746 
04747   SDValue SplatValue = N->getOperand(0);
04748   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
04749     if (N->getOperand(i) != SplatValue)
04750       return false;
04751   return true;
04752 }
04753 
04754 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
04755 /// to a zero vector.
04756 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
04757 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
04758   SDValue V1 = N->getOperand(0);
04759   SDValue V2 = N->getOperand(1);
04760   unsigned NumElems = N->getValueType(0).getVectorNumElements();
04761   for (unsigned i = 0; i != NumElems; ++i) {
04762     int Idx = N->getMaskElt(i);
04763     if (Idx >= (int)NumElems) {
04764       unsigned Opc = V2.getOpcode();
04765       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
04766         continue;
04767       if (Opc != ISD::BUILD_VECTOR ||
04768           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
04769         return false;
04770     } else if (Idx >= 0) {
04771       unsigned Opc = V1.getOpcode();
04772       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
04773         continue;
04774       if (Opc != ISD::BUILD_VECTOR ||
04775           !X86::isZeroNode(V1.getOperand(Idx)))
04776         return false;
04777     }
04778   }
04779   return true;
04780 }
04781 
04782 /// getZeroVector - Returns a vector of specified type with all zero elements.
04783 ///
04784 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
04785                              SelectionDAG &DAG, SDLoc dl) {
04786   assert(VT.isVector() && "Expected a vector type");
04787 
04788   // Always build SSE zero vectors as <4 x i32> bitcasted
04789   // to their dest type. This ensures they get CSE'd.
04790   SDValue Vec;
04791   if (VT.is128BitVector()) {  // SSE
04792     if (Subtarget->hasSSE2()) {  // SSE2
04793       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04794       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04795     } else { // SSE1
04796       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
04797       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
04798     }
04799   } else if (VT.is256BitVector()) { // AVX
04800     if (Subtarget->hasInt256()) { // AVX2
04801       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04802       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04803       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
04804                         array_lengthof(Ops));
04805     } else {
04806       // 256-bit logic and arithmetic instructions in AVX are all
04807       // floating-point, no support for integer ops. Emit fp zeroed vectors.
04808       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
04809       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04810       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
04811                         array_lengthof(Ops));
04812     }
04813   } else if (VT.is512BitVector()) { // AVX-512
04814       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04815       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
04816                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04817       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
04818   } else if (VT.getScalarType() == MVT::i1) {
04819     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
04820     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
04821     SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
04822                       Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04823     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
04824                        Ops, VT.getVectorNumElements());
04825   } else
04826     llvm_unreachable("Unexpected vector type");
04827 
04828   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04829 }
04830 
04831 /// getOnesVector - Returns a vector of specified type with all bits set.
04832 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
04833 /// no AVX2 support, use two <4 x i32>s inserted in an <8 x i32> appropriately.
04834 /// Then bitcast to their original type, ensuring they get CSE'd.
04835 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
04836                              SDLoc dl) {
04837   assert(VT.isVector() && "Expected a vector type");
04838 
04839   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
04840   SDValue Vec;
04841   if (VT.is256BitVector()) {
04842     if (HasInt256) { // AVX2
04843       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04844       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
04845                         array_lengthof(Ops));
04846     } else { // AVX
04847       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04848       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
04849     }
04850   } else if (VT.is128BitVector()) {
04851     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04852   } else
04853     llvm_unreachable("Unexpected vector type");
04854 
04855   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04856 }
04857 
04858 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
04859 /// that point to V2 point to its first element.
04860 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
04861   for (unsigned i = 0; i != NumElems; ++i) {
04862     if (Mask[i] > (int)NumElems) {
04863       Mask[i] = NumElems;
04864     }
04865   }
04866 }
04867 
04868 /// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
04869 /// operation of the specified width.
04870 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
04871                        SDValue V2) {
04872   unsigned NumElems = VT.getVectorNumElements();
04873   SmallVector<int, 8> Mask;
04874   Mask.push_back(NumElems);
04875   for (unsigned i = 1; i != NumElems; ++i)
04876     Mask.push_back(i);
04877   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04878 }
04879 
04880 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
04881 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04882                           SDValue V2) {
04883   unsigned NumElems = VT.getVectorNumElements();
04884   SmallVector<int, 8> Mask;
04885   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
04886     Mask.push_back(i);
04887     Mask.push_back(i + NumElems);
04888   }
04889   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04890 }
04891 
04892 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
04893 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04894                           SDValue V2) {
04895   unsigned NumElems = VT.getVectorNumElements();
04896   SmallVector<int, 8> Mask;
04897   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
04898     Mask.push_back(i + Half);
04899     Mask.push_back(i + NumElems + Half);
04900   }
04901   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04902 }
04903 
04904 // PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
04905 // a generic shuffle instruction because the target has no such instructions.
04906 // Generate shuffles which repeat i16 and i8 several times until they can be
04907 // represented by v4f32 and then be manipulated by target supported shuffles.
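// For example, splatting element 5 of a v8i16: a single unpackh pairs up the
// high elements (giving <4, 4, 5, 5, 6, 6, 7, 7>) and EltNo becomes 1, which
// then indexes the splat in the v4f32 view used by getLegalSplat.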
04908 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
04909   MVT VT = V.getSimpleValueType();
04910   int NumElems = VT.getVectorNumElements();
04911   SDLoc dl(V);
04912 
04913   while (NumElems > 4) {
04914     if (EltNo < NumElems/2) {
04915       V = getUnpackl(DAG, dl, VT, V, V);
04916     } else {
04917       V = getUnpackh(DAG, dl, VT, V, V);
04918       EltNo -= NumElems/2;
04919     }
04920     NumElems >>= 1;
04921   }
04922   return V;
04923 }
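// Illustrative trace (assuming a v16i8 splat of element 11): the first round
// sees EltNo=11 >= 8, so the vector is unpacked-high with itself (EltNo becomes
// 3, the logical NumElems becomes 8); the second round unpacks low with itself
// (logical NumElems becomes 4) and the loop ends.  After the two self-unpacks
// the splatted byte fills the entire 32-bit lane number EltNo (here lane 3),
// so the value can be bitcast to v4f32 and splatted by getLegalSplat below.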
04924 
04925 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
04926 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
04927   MVT VT = V.getSimpleValueType();
04928   SDLoc dl(V);
04929 
04930   if (VT.is128BitVector()) {
04931     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
04932     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
04933     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
04934                              &SplatMask[0]);
04935   } else if (VT.is256BitVector()) {
04936     // To use VPERMILPS to splat scalars, the second half of indices must
04937     // refer to the higher part, which is a duplication of the lower one,
04938     // because VPERMILPS can only handle in-lane permutations.
04939     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
04940                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
04941 
04942     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
04943     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
04944                              &SplatMask[0]);
04945   } else
04946     llvm_unreachable("Vector size not supported");
04947 
04948   return DAG.getNode(ISD::BITCAST, dl, VT, V);
04949 }
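// Illustrative sketch (assuming a 256-bit vector and EltNo == 2): the mask used
// above is <2, 2, 2, 2, 6, 6, 6, 6>.  Both halves read the same in-lane index
// because VPERMILPS cannot cross 128-bit lanes; the caller (PromoteSplat) has
// already duplicated the interesting 128-bit half into both lanes, so both
// lanes end up holding the same scalar.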
04950 
04951 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
04952 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
04953   MVT SrcVT = SV->getSimpleValueType(0);
04954   SDValue V1 = SV->getOperand(0);
04955   SDLoc dl(SV);
04956 
04957   int EltNo = SV->getSplatIndex();
04958   int NumElems = SrcVT.getVectorNumElements();
04959   bool Is256BitVec = SrcVT.is256BitVector();
04960 
04961   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
04962          "Unknown how to promote splat for type");
04963 
04964   // Extract the 128-bit part containing the splat element and update
04965   // the splat element index when it refers to the higher register.
04966   if (Is256BitVec) {
04967     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
04968     if (EltNo >= NumElems/2)
04969       EltNo -= NumElems/2;
04970   }
04971 
04972   // All i16 and i8 vector types can't be used directly by a generic shuffle
04973   // instruction because the target has no such instruction. Generate shuffles
04974   // which repeat i16 and i8 several times until they fit in i32, and then can
04975   // be manipulated by target supported shuffles.
04976   MVT EltVT = SrcVT.getVectorElementType();
04977   if (EltVT == MVT::i8 || EltVT == MVT::i16)
04978     V1 = PromoteSplati8i16(V1, DAG, EltNo);
04979 
04980   // Recreate the 256-bit vector and place the same 128-bit vector
04981   // into the low and high part. This is necessary because we want
04982   // to use VPERM* to shuffle the vectors
04983   if (Is256BitVec) {
04984     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
04985   }
04986 
04987   return getLegalSplat(DAG, V1, EltNo);
04988 }
04989 
04990 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
04991 /// vector and a zero or undef vector.  This produces a shuffle where the low
04992 /// element of V2 is swizzled into the zero/undef vector, landing at element
04993 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
04994 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
04995                                            bool IsZero,
04996                                            const X86Subtarget *Subtarget,
04997                                            SelectionDAG &DAG) {
04998   MVT VT = V2.getSimpleValueType();
04999   SDValue V1 = IsZero
05000     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
05001   unsigned NumElems = VT.getVectorNumElements();
05002   SmallVector<int, 16> MaskVec;
05003   for (unsigned i = 0; i != NumElems; ++i)
05004     // If this is the insertion idx, put the low elt of V2 here.
05005     MaskVec.push_back(i == Idx ? NumElems : i);
05006   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
05007 }
05008 
05009 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
05010 /// target specific opcode. Returns true if the Mask could be calculated.
05011 /// Sets IsUnary to true if the node uses only one source.
05012 static bool getTargetShuffleMask(SDNode *N, MVT VT,
05013                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
05014   unsigned NumElems = VT.getVectorNumElements();
05015   SDValue ImmN;
05016 
05017   IsUnary = false;
05018   switch(N->getOpcode()) {
05019   case X86ISD::SHUFP:
05020     ImmN = N->getOperand(N->getNumOperands()-1);
05021     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05022     break;
05023   case X86ISD::UNPCKH:
05024     DecodeUNPCKHMask(VT, Mask);
05025     break;
05026   case X86ISD::UNPCKL:
05027     DecodeUNPCKLMask(VT, Mask);
05028     break;
05029   case X86ISD::MOVHLPS:
05030     DecodeMOVHLPSMask(NumElems, Mask);
05031     break;
05032   case X86ISD::MOVLHPS:
05033     DecodeMOVLHPSMask(NumElems, Mask);
05034     break;
05035   case X86ISD::PALIGNR:
05036     ImmN = N->getOperand(N->getNumOperands()-1);
05037     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05038     break;
05039   case X86ISD::PSHUFD:
05040   case X86ISD::VPERMILP:
05041     ImmN = N->getOperand(N->getNumOperands()-1);
05042     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05043     IsUnary = true;
05044     break;
05045   case X86ISD::PSHUFHW:
05046     ImmN = N->getOperand(N->getNumOperands()-1);
05047     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05048     IsUnary = true;
05049     break;
05050   case X86ISD::PSHUFLW:
05051     ImmN = N->getOperand(N->getNumOperands()-1);
05052     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05053     IsUnary = true;
05054     break;
05055   case X86ISD::VPERMI:
05056     ImmN = N->getOperand(N->getNumOperands()-1);
05057     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05058     IsUnary = true;
05059     break;
05060   case X86ISD::MOVSS:
05061   case X86ISD::MOVSD: {
05062     // Index 0 always comes from the first element of the second source;
05063     // this is why MOVSS and MOVSD are used in the first place. The other
05064     // elements come from the corresponding positions of the first source vector.
05065     Mask.push_back(NumElems);
05066     for (unsigned i = 1; i != NumElems; ++i) {
05067       Mask.push_back(i);
05068     }
05069     break;
05070   }
05071   case X86ISD::VPERM2X128:
05072     ImmN = N->getOperand(N->getNumOperands()-1);
05073     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05074     if (Mask.empty()) return false;
05075     break;
05076   case X86ISD::MOVDDUP:
05077   case X86ISD::MOVLHPD:
05078   case X86ISD::MOVLPD:
05079   case X86ISD::MOVLPS:
05080   case X86ISD::MOVSHDUP:
05081   case X86ISD::MOVSLDUP:
05082     // Not yet implemented
05083     return false;
05084   default: llvm_unreachable("unknown target shuffle node");
05085   }
05086 
05087   return true;
05088 }
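// Illustrative sketch (immediates chosen for exposition only): for a v4f32
// X86ISD::SHUFP node with immediate 0x4E (binary 01 00 11 10) the decoded mask
// is <2, 3, 4, 5> (two elements from the first source, two from the second),
// while a v4i32 X86ISD::PSHUFD with immediate 0x1B decodes to the unary mask
// <3, 2, 1, 0>.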
05089 
05090 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
05091 /// element of the result of the vector shuffle.
05092 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
05093                                    unsigned Depth) {
05094   if (Depth == 6)
05095     return SDValue();  // Limit search depth.
05096 
05097   SDValue V = SDValue(N, 0);
05098   EVT VT = V.getValueType();
05099   unsigned Opcode = V.getOpcode();
05100 
05101   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
05102   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
05103     int Elt = SV->getMaskElt(Index);
05104 
05105     if (Elt < 0)
05106       return DAG.getUNDEF(VT.getVectorElementType());
05107 
05108     unsigned NumElems = VT.getVectorNumElements();
05109     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
05110                                          : SV->getOperand(1);
05111     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
05112   }
05113 
05114   // Recurse into target specific vector shuffles to find scalars.
05115   if (isTargetShuffle(Opcode)) {
05116     MVT ShufVT = V.getSimpleValueType();
05117     unsigned NumElems = ShufVT.getVectorNumElements();
05118     SmallVector<int, 16> ShuffleMask;
05119     bool IsUnary;
05120 
05121     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
05122       return SDValue();
05123 
05124     int Elt = ShuffleMask[Index];
05125     if (Elt < 0)
05126       return DAG.getUNDEF(ShufVT.getVectorElementType());
05127 
05128     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
05129                                          : N->getOperand(1);
05130     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
05131                                Depth+1);
05132   }
05133 
05134   // Actual nodes that may contain scalar elements
05135   if (Opcode == ISD::BITCAST) {
05136     V = V.getOperand(0);
05137     EVT SrcVT = V.getValueType();
05138     unsigned NumElems = VT.getVectorNumElements();
05139 
05140     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
05141       return SDValue();
05142   }
05143 
05144   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
05145     return (Index == 0) ? V.getOperand(0)
05146                         : DAG.getUNDEF(VT.getVectorElementType());
05147 
05148   if (V.getOpcode() == ISD::BUILD_VECTOR)
05149     return V.getOperand(Index);
05150 
05151   return SDValue();
05152 }
05153 
05154 /// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
05155 /// vector shuffle operation which come from a zero vector. The
05156 /// search can start in two different directions, from left or right.
05157 /// We count undefs as zeros until PreferredNum is reached.
05158 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
05159                                          unsigned NumElems, bool ZerosFromLeft,
05160                                          SelectionDAG &DAG,
05161                                          unsigned PreferredNum = -1U) {
05162   unsigned NumZeros = 0;
05163   for (unsigned i = 0; i != NumElems; ++i) {
05164     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
05165     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
05166     if (!Elt.getNode())
05167       break;
05168 
05169     if (X86::isZeroNode(Elt))
05170       ++NumZeros;
05171     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
05172       NumZeros = std::min(NumZeros + 1, PreferredNum);
05173     else
05174       break;
05175   }
05176 
05177   return NumZeros;
05178 }
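// Illustrative sketch (element values assumed for exposition, and
// PreferredNum >= 2): if the shuffle's per-element scalars resolve to
// <0, undef, A, B> with A and B non-zero, scanning with ZerosFromLeft=true
// returns 2 (the undef is counted as a zero, capped by PreferredNum), while
// scanning with ZerosFromLeft=false returns 0 because element 3 (B) is the
// first element inspected.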
05179 
05180 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
05181 /// correspond consecutively to elements from one of the vector operands,
05182 /// starting from its index OpIdx. Also sets OpNum to the source operand used.
05183 static
05184 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
05185                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
05186                               unsigned NumElems, unsigned &OpNum) {
05187   bool SeenV1 = false;
05188   bool SeenV2 = false;
05189 
05190   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
05191     int Idx = SVOp->getMaskElt(i);
05192     // Ignore undef indices
05193     if (Idx < 0)
05194       continue;
05195 
05196     if (Idx < (int)NumElems)
05197       SeenV1 = true;
05198     else
05199       SeenV2 = true;
05200 
05201     // Only accept consecutive elements from the same vector
05202     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
05203       return false;
05204   }
05205 
05206   OpNum = SeenV1 ? 0 : 1;
05207   return true;
05208 }
05209 
05210 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
05211 /// logical right shift of a vector.
05212 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05213                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05214   unsigned NumElems =
05215     SVOp->getSimpleValueType(0).getVectorNumElements();
05216   unsigned NumZeros = getNumOfConsecutiveZeros(
05217       SVOp, NumElems, false /* check zeros from right */, DAG,
05218       SVOp->getMaskElt(0));
05219   unsigned OpSrc;
05220 
05221   if (!NumZeros)
05222     return false;
05223 
05224   // Considering the elements in the mask that are not consecutive zeros,
05225   // check if they consecutively come from only one of the source vectors.
05226   //
05227   //               V1 = {X, A, B, C}     0
05228   //                         \  \  \    /
05229   //   vector_shuffle V1, V2 <1, 2, 3, X>
05230   //
05231   if (!isShuffleMaskConsecutive(SVOp,
05232             0,                   // Mask Start Index
05233             NumElems-NumZeros,   // Mask End Index(exclusive)
05234             NumZeros,            // Where to start looking in the src vector
05235             NumElems,            // Number of elements in vector
05236             OpSrc))              // Which source operand ?
05237     return false;
05238 
05239   isLeft = false;
05240   ShAmt = NumZeros;
05241   ShVal = SVOp->getOperand(OpSrc);
05242   return true;
05243 }
05244 
05245 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
05246 /// logical left shift of a vector.
05247 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05248                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05249   unsigned NumElems =
05250     SVOp->getSimpleValueType(0).getVectorNumElements();
05251   unsigned NumZeros = getNumOfConsecutiveZeros(
05252       SVOp, NumElems, true /* check zeros from left */, DAG,
05253       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
05254   unsigned OpSrc;
05255 
05256   if (!NumZeros)
05257     return false;
05258 
05259   // Considering the elements in the mask that are not consecutive zeros,
05260   // check if they consecutively come from only one of the source vectors.
05261   //
05262   //                           0    { A, B, X, X } = V2
05263   //                          / \    /  /
05264   //   vector_shuffle V1, V2 <X, X, 4, 5>
05265   //
05266   if (!isShuffleMaskConsecutive(SVOp,
05267             NumZeros,     // Mask Start Index
05268             NumElems,     // Mask End Index(exclusive)
05269             0,            // Where to start looking in the src vector
05270             NumElems,     // Number of elements in vector
05271             OpSrc))       // Which source operand ?
05272     return false;
05273 
05274   isLeft = true;
05275   ShAmt = NumZeros;
05276   ShVal = SVOp->getOperand(OpSrc);
05277   return true;
05278 }
05279 
05280 /// isVectorShift - Returns true if the shuffle can be implemented as a
05281 /// logical left or right shift of a vector.
05282 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05283                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05284   // Although the logic below supports any vector bitwidth, there are no
05285   // shift instructions which handle more than 128-bit vectors.
05286   if (!SVOp->getSimpleValueType(0).is128BitVector())
05287     return false;
05288 
05289   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
05290       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
05291     return true;
05292 
05293   return false;
05294 }
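// Illustrative sketch (V2 assumed to be an all-zeros vector): for a v4i32
// shuffle of V1, V2 with mask <1, 2, 3, 4>, one zero is found from the right
// and mask elements 1..3 come consecutively from V1 starting at index 1, so
// this is recognised as a right shift by one element (PSRLDQ by 4 bytes).
// Symmetrically, the mask <4, 0, 1, 2> is recognised as a left shift by one
// element (PSLLDQ by 4 bytes).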
05295 
05296 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
05297 ///
05298 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
05299                                        unsigned NumNonZero, unsigned NumZero,
05300                                        SelectionDAG &DAG,
05301                                        const X86Subtarget* Subtarget,
05302                                        const TargetLowering &TLI) {
05303   if (NumNonZero > 8)
05304     return SDValue();
05305 
05306   SDLoc dl(Op);
05307   SDValue V(0, 0);
05308   bool First = true;
05309   for (unsigned i = 0; i < 16; ++i) {
05310     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
05311     if (ThisIsNonZero && First) {
05312       if (NumZero)
05313         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05314       else
05315         V = DAG.getUNDEF(MVT::v8i16);
05316       First = false;
05317     }
05318 
05319     if ((i & 1) != 0) {
05320       SDValue ThisElt(0, 0), LastElt(0, 0);
05321       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
05322       if (LastIsNonZero) {
05323         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
05324                               MVT::i16, Op.getOperand(i-1));
05325       }
05326       if (ThisIsNonZero) {
05327         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
05328         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
05329                               ThisElt, DAG.getConstant(8, MVT::i8));
05330         if (LastIsNonZero)
05331           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
05332       } else
05333         ThisElt = LastElt;
05334 
05335       if (ThisElt.getNode())
05336         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
05337                         DAG.getIntPtrConstant(i/2));
05338     }
05339   }
05340 
05341   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
05342 }
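// Illustrative sketch (byte operands b0..b15 assumed non-zero): the loop above
// pairs adjacent bytes into i16 values, e.g. bytes (b0, b1) become
//   ((zext b1 to i16) << 8) | (zext b0 to i16)
// and that i16 is inserted at position 0 of a v8i16, so a v16i8 build_vector is
// lowered with at most eight 16-bit element insertions and a final bitcast back
// to v16i8.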
05343 
05344 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
05345 ///
05346 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
05347                                      unsigned NumNonZero, unsigned NumZero,
05348                                      SelectionDAG &DAG,
05349                                      const X86Subtarget* Subtarget,
05350                                      const TargetLowering &TLI) {
05351   if (NumNonZero > 4)
05352     return SDValue();
05353 
05354   SDLoc dl(Op);
05355   SDValue V(0, 0);
05356   bool First = true;
05357   for (unsigned i = 0; i < 8; ++i) {
05358     bool isNonZero = (NonZeros & (1 << i)) != 0;
05359     if (isNonZero) {
05360       if (First) {
05361         if (NumZero)
05362           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05363         else
05364           V = DAG.getUNDEF(MVT::v8i16);
05365         First = false;
05366       }
05367       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
05368                       MVT::v8i16, V, Op.getOperand(i),
05369                       DAG.getIntPtrConstant(i));
05370     }
05371   }
05372 
05373   return V;
05374 }
05375 
05376 /// getVShift - Return a vector logical shift node.
05377 ///
05378 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
05379                          unsigned NumBits, SelectionDAG &DAG,
05380                          const TargetLowering &TLI, SDLoc dl) {
05381   assert(VT.is128BitVector() && "Unknown type for VShift");
05382   EVT ShVT = MVT::v2i64;
05383   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
05384   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
05385   return DAG.getNode(ISD::BITCAST, dl, VT,
05386                      DAG.getNode(Opc, dl, ShVT, SrcOp,
05387                              DAG.getConstant(NumBits,
05388                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
05389 }
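// Illustrative sketch (a minimal hypothetical use, not taken from this file;
// Src is an assumed 128-bit value):
//   SDValue Shifted = getVShift(/*isLeft=*/true, MVT::v4i32, Src,
//                               /*NumBits=*/32, DAG, TLI, dl);
// This bitcasts Src to v2i64 and emits X86ISD::VSHLDQ, i.e. a whole-register
// byte shift in the PSLLDQ style, with the amount expressed in bits at this
// level of the DAG.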
05390 
05391 static SDValue
05392 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
05393 
05394   // Check if the scalar load can be widened into a vector load, and if
05395   // the address is "base + cst", see if the cst can be "absorbed" into
05396   // the shuffle mask.
05397   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
05398     SDValue Ptr = LD->getBasePtr();
05399     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
05400       return SDValue();
05401     EVT PVT = LD->getValueType(0);
05402     if (PVT != MVT::i32 && PVT != MVT::f32)
05403       return SDValue();
05404 
05405     int FI = -1;
05406     int64_t Offset = 0;
05407     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
05408       FI = FINode->getIndex();
05409       Offset = 0;
05410     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
05411                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
05412       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
05413       Offset = Ptr.getConstantOperandVal(1);
05414       Ptr = Ptr.getOperand(0);
05415     } else {
05416       return SDValue();
05417     }
05418 
05419     // FIXME: 256-bit vector instructions don't require a strict alignment,
05420     // improve this code to support it better.
05421     unsigned RequiredAlign = VT.getSizeInBits()/8;
05422     SDValue Chain = LD->getChain();
05423     // Make sure the stack object alignment is at least 16 or 32.
05424     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
05425     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
05426       if (MFI->isFixedObjectIndex(FI)) {
05427         // Can't change the alignment. FIXME: It's possible to compute
05428         // the exact stack offset and reference FI + adjust offset instead.
05429         // If someone *really* cares about this. That's the way to implement it.
05430         return SDValue();
05431       } else {
05432         MFI->setObjectAlignment(FI, RequiredAlign);
05433       }
05434     }
05435 
05436     // (Offset % 16 or 32) must be a multiple of 4. The address is then
05437     // Ptr + (Offset & ~(RequiredAlign-1)).
05438     if (Offset < 0)
05439       return SDValue();
05440     if ((Offset % RequiredAlign) & 3)
05441       return SDValue();
05442     int64_t StartOffset = Offset & ~(RequiredAlign-1);
05443     if (StartOffset)
05444       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
05445                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
05446 
05447     int EltNo = (Offset - StartOffset) >> 2;
05448     unsigned NumElems = VT.getVectorNumElements();
05449 
05450     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
05451     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
05452                              LD->getPointerInfo().getWithOffset(StartOffset),
05453                              false, false, false, 0);
05454 
05455     SmallVector<int, 8> Mask;
05456     for (unsigned i = 0; i != NumElems; ++i)
05457       Mask.push_back(EltNo);
05458 
05459     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
05460   }
05461 
05462   return SDValue();
05463 }
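// Illustrative trace (frame-slot offsets assumed for exposition): widening a
// scalar i32 load at [slot + 8] to a v4f32 splat uses RequiredAlign = 16,
// StartOffset = 8 & ~15 = 0 and EltNo = (8 - 0) >> 2 = 2, so the code emits a
// 16-byte v4i32 load of the whole slot and splats element 2 with the mask
// <2, 2, 2, 2>.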
05464 
05465 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
05466 /// vector of type 'VT', see if the elements can be replaced by a single large
05467 /// load which has the same value as a build_vector whose operands are 'elts'.
05468 ///
05469 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
05470 ///
05471 /// FIXME: we'd also like to handle the case where the last elements are zero
05472 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
05473 /// There's even a handy isZeroNode for that purpose.
05474 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
05475                                         SDLoc &DL, SelectionDAG &DAG,
05476                                         bool isAfterLegalize) {
05477   EVT EltVT = VT.getVectorElementType();
05478   unsigned NumElems = Elts.size();
05479 
05480   LoadSDNode *LDBase = NULL;
05481   unsigned LastLoadedElt = -1U;
05482 
05483   // For each element in the initializer, see if we've found a load or an undef.
05484   // If we don't find an initial load element, or later load elements are
05485   // non-consecutive, bail out.
05486   for (unsigned i = 0; i < NumElems; ++i) {
05487     SDValue Elt = Elts[i];
05488 
05489     if (!Elt.getNode() ||
05490         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
05491       return SDValue();
05492     if (!LDBase) {
05493       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
05494         return SDValue();
05495       LDBase = cast<LoadSDNode>(Elt.getNode());
05496       LastLoadedElt = i;
05497       continue;
05498     }
05499     if (Elt.getOpcode() == ISD::UNDEF)
05500       continue;
05501 
05502     LoadSDNode *LD = cast<LoadSDNode>(Elt);
05503     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
05504       return SDValue();
05505     LastLoadedElt = i;
05506   }
05507 
05508   // If we have found an entire vector of loads and undefs, then return a large
05509   // load of the entire vector width starting at the base pointer.  If we found
05510   // consecutive loads for the low half, generate a vzext_load node.
05511   if (LastLoadedElt == NumElems - 1) {
05512 
05513     if (isAfterLegalize &&
05514         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
05515       return SDValue();
05516 
05517     SDValue NewLd = SDValue();
05518 
05519     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
05520       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05521                           LDBase->getPointerInfo(), LDBase->isVolatile(),
05522                           LDBase->isNonTemporal(), LDBase->isInvariant(), 0);
05523     else
05524       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05525                           LDBase->getPointerInfo(), LDBase->isVolatile(),
05526                           LDBase->isNonTemporal(), LDBase->isInvariant(),
05527                           LDBase->getAlignment());
05528 
05529     if (LDBase->hasAnyUseOfValue(1)) {
05530       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05531                                      SDValue(LDBase, 1),
05532                                      SDValue(NewLd.getNode(), 1));
05533       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05534       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05535                              SDValue(NewLd.getNode(), 1));
05536     }
05537 
05538     return NewLd;
05539   }
05540   if (NumElems == 4 && LastLoadedElt == 1 &&
05541       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
05542     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
05543     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
05544     SDValue ResNode =
05545         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
05546                                 array_lengthof(Ops), MVT::i64,
05547                                 LDBase->getPointerInfo(),
05548                                 LDBase->getAlignment(),
05549                                 false/*isVolatile*/, true/*ReadMem*/,
05550                                 false/*WriteMem*/);
05551 
05552     // Make sure the newly-created LOAD is in the same position as LDBase in
05553     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
05554     // update uses of LDBase's output chain to use the TokenFactor.
05555     if (LDBase->hasAnyUseOfValue(1)) {
05556       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05557                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
05558       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05559       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05560                              SDValue(ResNode.getNode(), 1));
05561     }
05562 
05563     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
05564   }
05565   return SDValue();
05566 }
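// Illustrative sketch: a v4i32 build_vector of loads from a, a+4, a+8, a+12
// collapses into a single 16-byte load from a, while <load a, load a+4, undef,
// undef> becomes an X86ISD::VZEXT_LOAD of i64.  In both cases, if the base
// load's output chain has other users, those users are redirected to a
// TokenFactor of the old and new chains so memory ordering is preserved, and
// the UpdateNodeOperands call restores the TokenFactor's operands so it does
// not end up referring to itself after the replacement.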
05567 
05568 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
05569 /// to generate a splat value for the following cases:
05570 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
05571 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
05572 /// a scalar load, or a constant.
05573 /// The VBROADCAST node is returned when a pattern is found,
05574 /// or SDValue() otherwise.
05575 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
05576                                     SelectionDAG &DAG) {
05577   if (!Subtarget->hasFp256())
05578     return SDValue();
05579 
05580   MVT VT = Op.getSimpleValueType();
05581   SDLoc dl(Op);
05582 
05583   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
05584          "Unsupported vector type for broadcast.");
05585 
05586   SDValue Ld;
05587   bool ConstSplatVal;
05588 
05589   switch (Op.getOpcode()) {
05590     default:
05591       // Unknown pattern found.
05592       return SDValue();
05593 
05594     case ISD::BUILD_VECTOR: {
05595       // The BUILD_VECTOR node must be a splat.
05596       if (!isSplatVector(Op.getNode()))
05597         return SDValue();
05598 
05599       Ld = Op.getOperand(0);
05600       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
05601                      Ld.getOpcode() == ISD::ConstantFP);
05602 
05603       // The suspected load node has several users. Make sure that all
05604       // of its users are from the BUILD_VECTOR node.
05605       // Constants may have multiple users.
05606       if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
05607         return SDValue();
05608       break;
05609     }
05610 
05611     case ISD::VECTOR_SHUFFLE: {
05612       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
05613 
05614       // Shuffles must have a splat mask where the first element is
05615       // broadcasted.
05616       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
05617         return SDValue();
05618 
05619       SDValue Sc = Op.getOperand(0);
05620       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
05621           Sc.getOpcode() != ISD::BUILD_VECTOR) {
05622 
05623         if (!Subtarget->hasInt256())
05624           return SDValue();
05625 
05626         // Use the register form of the broadcast instruction available on AVX2.
05627         if (VT.getSizeInBits() >= 256)
05628           Sc = Extract128BitVector(Sc, 0, DAG, dl);
05629         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
05630       }
05631 
05632       Ld = Sc.getOperand(0);
05633       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
05634                        Ld.getOpcode() == ISD::ConstantFP);
05635 
05636       // The scalar_to_vector node and the suspected
05637       // load node must have exactly one user.
05638       // Constants may have multiple users.
05639 
05640       // AVX-512 has a register version of the broadcast.
05641       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
05642         Ld.getValueType().getSizeInBits() >= 32;
05643       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
05644           !hasRegVer))
05645         return SDValue();
05646       break;
05647     }
05648   }
05649 
05650   bool IsGE256 = (VT.getSizeInBits() >= 256);
05651 
05652   // Handle broadcasting a single constant scalar from the constant pool
05653   // into a vector. On Sandybridge it is still better to load a constant vector
05654   // from the constant pool and not to broadcast it from a scalar.
05655   if (ConstSplatVal && Subtarget->hasInt256()) {
05656     EVT CVT = Ld.getValueType();
05657     assert(!CVT.isVector() && "Must not broadcast a vector type");
05658     unsigned ScalarSize = CVT.getSizeInBits();
05659 
05660     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
05661       const Constant *C = 0;
05662       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
05663         C = CI->getConstantIntValue();
05664       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
05665         C = CF->getConstantFPValue();
05666 
05667       assert(C && "Invalid constant type");
05668 
05669       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
05670       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
05671       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
05672       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
05673                        MachinePointerInfo::getConstantPool(),
05674                        false, false, false, Alignment);
05675 
05676       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05677     }
05678   }
05679 
05680   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
05681   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
05682 
05683   // Handle AVX2 in-register broadcasts.
05684   if (!IsLoad && Subtarget->hasInt256() &&
05685       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
05686     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05687 
05688   // The scalar source must be a normal load.
05689   if (!IsLoad)
05690     return SDValue();
05691 
05692   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
05693     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05694 
05695   // The integer check is needed for the 64-bit into 128-bit case, so it does
05696   // not match double, since there is no vbroadcastsd for xmm registers.
05697   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
05698     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
05699       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05700   }
05701 
05702   // Unsupported broadcast.
05703   return SDValue();
05704 }
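// Illustrative sketch (AVX/AVX2 assumed, per the subtarget checks above): a
// v8f32 BUILD_VECTOR that splats a single f32 loaded from memory becomes
// X86ISD::VBROADCAST of that load (vbroadcastss from memory); with AVX2, a
// splat of a 32-bit constant is first materialised as a scalar constant-pool
// load and then broadcast, and a splat of a value already in a register can use
// the register form of the broadcast.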
05705 
05706 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
05707 /// underlying vector and index.
05708 ///
05709 /// Modifies \p ExtractedFromVec to the real vector and returns the real
05710 /// index.
05711 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
05712                                          SDValue ExtIdx) {
05713   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
05714   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
05715     return Idx;
05716 
05717   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
05718   // lowered this:
05719   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
05720   // to:
05721   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
05722   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
05723   //                           undef)
05724   //                       Constant<0>)
05725   // In this case the vector is the extract_subvector expression and the index
05726   // is 2, as specified by the shuffle.
05727   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
05728   SDValue ShuffleVec = SVOp->getOperand(0);
05729   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
05730   assert(ShuffleVecVT.getVectorElementType() ==
05731          ExtractedFromVec.getSimpleValueType().getVectorElementType());
05732 
05733   int ShuffleIdx = SVOp->getMaskElt(Idx);
05734   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
05735     ExtractedFromVec = ShuffleVec;
05736     return ShuffleIdx;
05737   }
05738   return Idx;
05739 }
05740 
05741 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
05742   MVT VT = Op.getSimpleValueType();
05743 
05744   // Skip if insert_vec_elt is not supported.
05745   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
05746   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
05747     return SDValue();
05748 
05749   SDLoc DL(Op);
05750   unsigned NumElems = Op.getNumOperands();
05751 
05752   SDValue VecIn1;
05753   SDValue VecIn2;
05754   SmallVector<unsigned, 4> InsertIndices;
05755   SmallVector<int, 8> Mask(NumElems, -1);
05756 
05757   for (unsigned i = 0; i != NumElems; ++i) {
05758     unsigned Opc = Op.getOperand(i).getOpcode();
05759 
05760     if (Opc == ISD::UNDEF)
05761       continue;
05762 
05763     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
05764       // Quit if more than 1 element needs inserting.
05765       if (InsertIndices.size() > 1)
05766         return SDValue();
05767 
05768       InsertIndices.push_back(i);
05769       continue;
05770     }
05771 
05772     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
05773     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
05774     // Quit if non-constant index.
05775     if (!isa<ConstantSDNode>(ExtIdx))
05776       return SDValue();
05777     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
05778 
05779     // Quit if extracted from vector of different type.
05780     if (ExtractedFromVec.getValueType() != VT)
05781       return SDValue();
05782 
05783     if (VecIn1.getNode() == 0)
05784       VecIn1 = ExtractedFromVec;
05785     else if (VecIn1 != ExtractedFromVec) {
05786       if (VecIn2.getNode() == 0)
05787         VecIn2 = ExtractedFromVec;
05788       else if (VecIn2 != ExtractedFromVec)
05789         // Quit if more than 2 vectors to shuffle
05790         return SDValue();
05791     }
05792 
05793     if (ExtractedFromVec == VecIn1)
05794       Mask[i] = Idx;
05795     else if (ExtractedFromVec == VecIn2)
05796       Mask[i] = Idx + NumElems;
05797   }
05798 
05799   if (VecIn1.getNode() == 0)
05800     return SDValue();
05801 
05802   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
05803   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
05804   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
05805     unsigned Idx = InsertIndices[i];
05806     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
05807                      DAG.getIntPtrConstant(Idx));
05808   }
05809 
05810   return NV;
05811 }
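// Illustrative sketch (operands assumed for exposition): a v4i32 build_vector
//   (extract_elt V, 0), (extract_elt V, 1), s, (extract_elt V, 3)
// becomes vector_shuffle<0,1,-1,3> V, undef followed by a single
// INSERT_VECTOR_ELT of s at index 2; more than one non-extract element, or
// extracts from more than two distinct source vectors, makes the routine give
// up.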
05812 
05813 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
05814 SDValue
05815 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
05816 
05817   MVT VT = Op.getSimpleValueType();
05818   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
05819          "Unexpected type in LowerBUILD_VECTORvXi1!");
05820 
05821   SDLoc dl(Op);
05822   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05823     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
05824     SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
05825                       Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05826     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
05827                        Ops, VT.getVectorNumElements());
05828   }
05829 
05830   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
05831     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
05832     SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
05833                       Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05834     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
05835                        Ops, VT.getVectorNumElements());
05836   }
05837 
05838   bool AllConstants = true;
05839   uint64_t Immediate = 0;
05840   int NonConstIdx = -1;
05841   bool IsSplat = true;
05842   unsigned NumNonConsts = 0;
05843   unsigned NumConsts = 0;
05844   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
05845     SDValue In = Op.getOperand(idx);
05846     if (In.getOpcode() == ISD::UNDEF)
05847       continue;
05848     if (!isa<ConstantSDNode>(In)) {
05849       AllConstants = false;
05850       NonConstIdx = idx;
05851       NumNonConsts++;
05852     }
05853     else {
05854       NumConsts++;
05855       if (cast<ConstantSDNode>(In)->getZExtValue())
05856         Immediate |= (1ULL << idx);
05857     }
05858     if (In != Op.getOperand(0))
05859       IsSplat = false;
05860   }
05861 
05862   if (AllConstants) {
05863     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
05864       DAG.getConstant(Immediate, MVT::i16));
05865     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
05866                        DAG.getIntPtrConstant(0));
05867   }
05868 
05869   if (NumNonConsts == 1 && NonConstIdx != 0) {
05870     SDValue DstVec;
05871     if (NumConsts) {
05872       SDValue VecAsImm = DAG.getConstant(Immediate,
05873                                          MVT::getIntegerVT(VT.getSizeInBits()));
05874       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
05875     }
05876     else 
05877       DstVec = DAG.getUNDEF(VT);
05878     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
05879                        Op.getOperand(NonConstIdx),
05880                        DAG.getIntPtrConstant(NonConstIdx));
05881   }
05882   if (!IsSplat && (NonConstIdx != 0))
05883     llvm_unreachable("Unsupported BUILD_VECTOR operation");
05884   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
05885   SDValue Select;
05886   if (IsSplat)
05887     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
05888                           DAG.getConstant(-1, SelectVT),
05889                           DAG.getConstant(0, SelectVT));
05890   else
05891     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
05892                          DAG.getConstant((Immediate | 1), SelectVT),
05893                          DAG.getConstant(Immediate, SelectVT));
05894   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
05895 }
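// Illustrative sketch (constant values assumed): an all-constant v8i1
// build_vector <1, 0, 1, 1, 0, 0, 0, 0> packs into the immediate 0b00001101,
// which is materialised as an i16, bitcast to v16i1 and narrowed to v8i1 with
// EXTRACT_SUBVECTOR; a vector with exactly one non-constant element (not in
// position 0) is instead built from the packed constants plus a single
// INSERT_VECTOR_ELT of that element.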
05896 
05897 SDValue
05898 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
05899   SDLoc dl(Op);
05900 
05901   MVT VT = Op.getSimpleValueType();
05902   MVT ExtVT = VT.getVectorElementType();
05903   unsigned NumElems = Op.getNumOperands();
05904 
05905   // Generate vectors for predicate vectors.
05906   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
05907     return LowerBUILD_VECTORvXi1(Op, DAG);
05908 
05909   // Vectors containing all zeros can be matched by pxor and xorps later
05910   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05911     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
05912     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
05913     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
05914       return Op;
05915 
05916     return getZeroVector(VT, Subtarget, DAG, dl);
05917   }
05918 
05919   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
05920   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
05921   // vpcmpeqd on 256-bit vectors.
05922   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
05923     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
05924       return Op;
05925 
05926     if (!VT.is512BitVector())
05927       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
05928   }
05929 
05930   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
05931   if (Broadcast.getNode())
05932     return Broadcast;
05933 
05934   unsigned EVTBits = ExtVT.getSizeInBits();
05935 
05936   unsigned NumZero  = 0;
05937   unsigned NumNonZero = 0;
05938   unsigned NonZeros = 0;
05939   bool IsAllConstants = true;
05940   SmallSet<SDValue, 8> Values;
05941   for (unsigned i = 0; i < NumElems; ++i) {
05942     SDValue Elt = Op.getOperand(i);
05943     if (Elt.getOpcode() == ISD::UNDEF)
05944       continue;
05945     Values.insert(Elt);
05946     if (Elt.getOpcode() != ISD::Constant &&
05947         Elt.getOpcode() != ISD::ConstantFP)
05948       IsAllConstants = false;
05949     if (X86::isZeroNode(Elt))
05950       NumZero++;
05951     else {
05952       NonZeros |= (1 << i);
05953       NumNonZero++;
05954     }
05955   }
05956 
05957   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
05958   if (NumNonZero == 0)
05959     return DAG.getUNDEF(VT);
05960 
05961   // Special case for single non-zero, non-undef, element.
05962   if (NumNonZero == 1) {
05963     unsigned Idx = countTrailingZeros(NonZeros);
05964     SDValue Item = Op.getOperand(Idx);
05965 
05966     // If this is an insertion of an i64 value on x86-32, and if the top bits of
05967     // the value are obviously zero, truncate the value to i32 and do the
05968     // insertion that way.  Only do this if the value is non-constant or if the
05969     // value is a constant being inserted into element 0.  It is cheaper to do
05970     // a constant pool load than it is to do a movd + shuffle.
05971     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
05972         (!IsAllConstants || Idx == 0)) {
05973       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
05974         // Handle SSE only.
05975         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
05976         EVT VecVT = MVT::v4i32;
05977         unsigned VecElts = 4;
05978 
05979         // Truncate the value (which may itself be a constant) to i32, and
05980         // convert it to a vector with movd (S2V+shuffle to zero extend).
05981         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
05982         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
05983         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05984 
05985         // Now we have our 32-bit value zero extended in the low element of
05986         // a vector.  If Idx != 0, swizzle it into place.
05987         if (Idx != 0) {
05988           SmallVector<int, 4> Mask;
05989           Mask.push_back(Idx);
05990           for (unsigned i = 1; i != VecElts; ++i)
05991             Mask.push_back(i);
05992           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
05993                                       &Mask[0]);
05994         }
05995         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
05996       }
05997     }
05998 
05999     // If we have a constant or non-constant insertion into the low element of
06000     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
06001     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
06002     // depending on what the source datatype is.
06003     if (Idx == 0) {
06004       if (NumZero == 0)
06005         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06006 
06007       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
06008           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
06009         if (VT.is256BitVector() || VT.is512BitVector()) {
06010           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
06011           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
06012                              Item, DAG.getIntPtrConstant(0));
06013         }
06014         assert(VT.is128BitVector() && "Expected an SSE value type!");
06015         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06016         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
06017         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06018       }
06019 
06020       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
06021         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
06022         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
06023         if (VT.is256BitVector()) {
06024           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
06025           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
06026         } else {
06027           assert(VT.is128BitVector() && "Expected an SSE value type!");
06028           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06029         }
06030         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06031       }
06032     }
06033 
06034     // Is it a vector logical left shift?
06035     if (NumElems == 2 && Idx == 1 &&
06036         X86::isZeroNode(Op.getOperand(0)) &&
06037         !X86::isZeroNode(Op.getOperand(1))) {
06038       unsigned NumBits = VT.getSizeInBits();
06039       return getVShift(true, VT,
06040                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
06041                                    VT, Op.getOperand(1)),
06042                        NumBits/2, DAG, *this, dl);
06043     }
06044 
06045     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
06046       return SDValue();
06047 
06048     // Otherwise, if this is a vector with i32 or f32 elements, and the element
06049     // is a non-constant being inserted into an element other than the low one,
06050     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
06051     // movd/movss) to move this into the low element, then shuffle it into
06052     // place.
06053     if (EVTBits == 32) {
06054       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06055 
06056       // Turn it into a shuffle of zero and zero-extended scalar to vector.
06057       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
06058       SmallVector<int, 8> MaskVec;
06059       for (unsigned i = 0; i != NumElems; ++i)
06060         MaskVec.push_back(i == Idx ? 0 : 1);
06061       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
06062     }
06063   }
06064 
06065   // Splat is obviously ok. Let legalizer expand it to a shuffle.
06066   if (Values.size() == 1) {
06067     if (EVTBits == 32) {
06068       // Instead of a shuffle like this:
06069       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
06070       // Check if it's possible to issue this instead.
06071       // shuffle (vload ptr), undef, <1, 1, 1, 1>
06072       unsigned Idx = countTrailingZeros(NonZeros);
06073       SDValue Item = Op.getOperand(Idx);
06074       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
06075         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
06076     }
06077     return SDValue();
06078   }
06079 
06080   // A vector full of immediates; various special cases are already
06081   // handled, so this is best done with a single constant-pool load.
06082   if (IsAllConstants)
06083     return SDValue();
06084 
06085   // For AVX-length vectors, build the individual 128-bit pieces and use
06086   // shuffles to put them in place.
06087   if (VT.is256BitVector() || VT.is512BitVector()) {
06088     SmallVector<SDValue, 64> V;
06089     for (unsigned i = 0; i != NumElems; ++i)
06090       V.push_back(Op.getOperand(i));
06091 
06092     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
06093 
06094     // Build both the lower and upper subvector.
06095     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
06096     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
06097                                 NumElems/2);
06098 
06099     // Recreate the wider vector with the lower and upper part.
06100     if (VT.is256BitVector())
06101       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06102     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06103   }
06104 
06105   // Let legalizer expand 2-wide build_vectors.
06106   if (EVTBits == 64) {
06107     if (NumNonZero == 1) {
06108       // One half is zero or undef.
06109       unsigned Idx = countTrailingZeros(NonZeros);
06110       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
06111                                  Op.getOperand(Idx));
06112       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
06113     }
06114     return SDValue();
06115   }
06116 
06117   // If element VT is < 32 bits, convert it to inserts into a zero vector.
06118   if (EVTBits == 8 && NumElems == 16) {
06119     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
06120                                         Subtarget, *this);
06121     if (V.getNode()) return V;
06122   }
06123 
06124   if (EVTBits == 16 && NumElems == 8) {
06125     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
06126                                       Subtarget, *this);
06127     if (V.getNode()) return V;
06128   }
06129 
06130   // If element VT is == 32 bits, turn it into a number of shuffles.
06131   SmallVector<SDValue, 8> V(NumElems);
06132   if (NumElems == 4 && NumZero > 0) {
06133     for (unsigned i = 0; i < 4; ++i) {
06134       bool isZero = !(NonZeros & (1 << i));
06135       if (isZero)
06136         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
06137       else
06138         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
06139     }
06140 
06141     for (unsigned i = 0; i < 2; ++i) {
06142       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
06143         default: break;
06144         case 0:
06145           V[i] = V[i*2];  // Must be a zero vector.
06146           break;
06147         case 1:
06148           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
06149           break;
06150         case 2:
06151           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
06152           break;
06153         case 3:
06154           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
06155           break;
06156       }
06157     }
06158 
06159     bool Reverse1 = (NonZeros & 0x3) == 2;
06160     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
06161     int MaskVec[] = {
06162       Reverse1 ? 1 : 0,
06163       Reverse1 ? 0 : 1,
06164       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
06165       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
06166     };
06167     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
06168   }
06169 
06170   if (Values.size() > 1 && VT.is128BitVector()) {
06171     // Check for a build vector of consecutive loads.
06172     for (unsigned i = 0; i < NumElems; ++i)
06173       V[i] = Op.getOperand(i);
06174 
06175     // Check for elements which are consecutive loads.
06176     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
06177     if (LD.getNode())
06178       return LD;
06179 
06180     // Check for a build vector from mostly shuffle plus few inserting.
06181     SDValue Sh = buildFromShuffleMostly(Op, DAG);
06182     if (Sh.getNode())
06183       return Sh;
06184 
06185     // For SSE 4.1, use insertps to put the high elements into the low element.
06186     if (getSubtarget()->hasSSE41()) {
06187       SDValue Result;
06188       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
06189         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
06190       else
06191         Result = DAG.getUNDEF(VT);
06192 
06193       for (unsigned i = 1; i < NumElems; ++i) {
06194         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
06195         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
06196                              Op.getOperand(i), DAG.getIntPtrConstant(i));
06197       }
06198       return Result;
06199     }
06200 
06201     // Otherwise, expand into a number of unpckl*, start by extending each of
06202     // our (non-undef) elements to the full vector width with the element in the
06203     // bottom slot of the vector (which generates no code for SSE).
06204     for (unsigned i = 0; i < NumElems; ++i) {
06205       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
06206         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
06207       else
06208         V[i] = DAG.getUNDEF(VT);
06209     }
06210 
06211     // Next, we iteratively mix elements, e.g. for v4f32:
06212     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
06213     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
06214     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
06215     unsigned EltStride = NumElems >> 1;
06216     while (EltStride != 0) {
06217       for (unsigned i = 0; i < EltStride; ++i) {
06218         // If V[i+EltStride] is undef and this is the first round of mixing,
06219         // then it is safe to just drop this shuffle: V[i] is already in the
06220         // right place, the one element (since it's the first round) being
06221         // inserted as undef can be dropped.  This isn't safe for successive
06222         // rounds because they will permute elements within both vectors.
06223         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
06224             EltStride == NumElems/2)
06225           continue;
06226 
06227         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
06228       }
06229       EltStride >>= 1;
06230     }
06231     return V[0];
06232   }
06233   return SDValue();
06234 }
06235 
06236 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
06237 // to create 256-bit vectors from two other 128-bit ones.
06238 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
06239   SDLoc dl(Op);
06240   MVT ResVT = Op.getSimpleValueType();
06241 
06242   assert((ResVT.is256BitVector() ||
06243           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
06244 
06245   SDValue V1 = Op.getOperand(0);
06246   SDValue V2 = Op.getOperand(1);
06247   unsigned NumElems = ResVT.getVectorNumElements();
06248   if (ResVT.is256BitVector())
06249     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
06250 
06251   if (Op.getNumOperands() == 4) {
06252     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
06253                                 ResVT.getVectorNumElements()/2);
06254     SDValue V3 = Op.getOperand(2);
06255     SDValue V4 = Op.getOperand(3);
06256     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
06257       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
06258   }
06259   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
06260 }
06261 
06262 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
06263   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
06264   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
06265          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
06266           Op.getNumOperands() == 4)));
06267 
06268   // AVX can use the vinsertf128 instruction to create 256-bit vectors
06269   // from two other 128-bit ones.
06270 
06271   // A 512-bit concatenation may combine 2 256-bit vectors or 4 128-bit vectors.
06272   return LowerAVXCONCAT_VECTORS(Op, DAG);
06273 }
06274 
06275 // Try to lower a shuffle node into a simple blend instruction.
06276 static SDValue
06277 LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
06278                            const X86Subtarget *Subtarget, SelectionDAG &DAG) {
06279   SDValue V1 = SVOp->getOperand(0);
06280   SDValue V2 = SVOp->getOperand(1);
06281   SDLoc dl(SVOp);
06282   MVT VT = SVOp->getSimpleValueType(0);
06283   MVT EltVT = VT.getVectorElementType();
06284   unsigned NumElems = VT.getVectorNumElements();
06285 
06286   // There is no blend with immediate in AVX-512.
06287   if (VT.is512BitVector())
06288     return SDValue();
06289 
06290   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
06291     return SDValue();
06292   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
06293     return SDValue();
06294 
06295   // Check the mask for BLEND and build the value.
06296   unsigned MaskValue = 0;
06297   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
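  // E.g. v16i16 has two 8-element lanes, while v8i16 and v4i32 have a single
  // lane.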
06298   unsigned NumLanes = (NumElems-1)/8 + 1;
06299   unsigned NumElemsInLane = NumElems / NumLanes;
06300 
06301   // Blend for v16i16 should be symmetric for both lanes.
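  // For example, the v8i16 mask <0, 9, 2, 11, 4, 13, 6, 15> takes the even
  // elements from V1 and the odd elements from V2, producing
  // MaskValue == 0b10101010 (0xAA) as the blend immediate.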
06302   for (unsigned i = 0; i < NumElemsInLane; ++i) {
06303 
06304     int SndLaneEltIdx = (NumLanes == 2) ?
06305       SVOp->getMaskElt(i + NumElemsInLane) : -1;
06306     int EltIdx = SVOp->getMaskElt(i);
06307 
06308     if ((EltIdx < 0 || EltIdx == (int)i) &&
06309         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
06310       continue;
06311 
06312     if (((unsigned)EltIdx == (i + NumElems)) &&
06313         (SndLaneEltIdx < 0 ||
06314          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
06315       MaskValue |= (1<<i);
06316     else
06317       return SDValue();
06318   }
06319 
06320   // Convert i32 vectors to floating point if AVX2 is not available.
06321   // AVX2 introduced the VPBLENDD instruction for 128- and 256-bit vectors.
06322   MVT BlendVT = VT;
06323   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
06324     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
06325                                NumElems);
06326     V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
06327     V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
06328   }
06329 
06330   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
06331                             DAG.getConstant(MaskValue, MVT::i32));
06332   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
06333 }
06334 
06335 /// In vector type \p VT, return true if the element at index \p InputIdx
06336 /// falls on a different 128-bit lane than \p OutputIdx.
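/// For example, with 32-bit elements index 3 lies in lane 0 while index 4 lies
/// in lane 1, so moving element 4 into slot 3 crosses a lane boundary.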
06337 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
06338                                      unsigned OutputIdx) {
06339   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
06340   return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
06341 }
06342 
06343 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
06344 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
06345 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If a
06346 /// \p MaskVals entry refers to an element outside of \p V1 or is undef (-1),
06347 /// a zero is inserted instead.
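/// For example, the v4i32 mask <1, -1, 2, 3> produces the v16i8 byte mask
/// <4,5,6,7, 0x80,0x80,0x80,0x80, 8,9,10,11, 12,13,14,15>, which zeroes the
/// second 32-bit element of the result.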
06348 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
06349                          SelectionDAG &DAG) {
06350   MVT VT = V1.getSimpleValueType();
06351   assert(VT.is128BitVector() || VT.is256BitVector());
06352 
06353   MVT EltVT = VT.getVectorElementType();
06354   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
06355   unsigned NumElts = VT.getVectorNumElements();
06356 
06357   SmallVector<SDValue, 32> PshufbMask;
06358   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
06359     int InputIdx = MaskVals[OutputIdx];
06360     unsigned InputByteIdx;
06361 
06362     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
06363       InputByteIdx = 0x80;
06364     else {
06365       // Cross lane is not allowed.
06366       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
06367         return SDValue();
06368       InputByteIdx = InputIdx * EltSizeInBytes;
06369       // Index is a byte offset within the 128-bit lane.
06370       InputByteIdx &= 0xf;
06371     }
06372 
06373     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
06374       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
06375       if (InputByteIdx != 0x80)
06376         ++InputByteIdx;
06377     }
06378   }
06379 
06380   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
06381   if (ShufVT != VT)
06382     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
06383   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
06384                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT,
06385                                  PshufbMask.data(), PshufbMask.size()));
06386 }
06387 
06388 // v8i16 shuffles - Prefer shuffles in the following order:
06389 // 1. [all]   pshuflw, pshufhw, optional move
06390 // 2. [ssse3] 1 x pshufb
06391 // 3. [ssse3] 2 x pshufb + 1 x por
06392 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
06393 static SDValue
06394 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
06395                          SelectionDAG &DAG) {
06396   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
06397   SDValue V1 = SVOp->getOperand(0);
06398   SDValue V2 = SVOp->getOperand(1);
06399   SDLoc dl(SVOp);
06400   SmallVector<int, 8> MaskVals;
06401 
06402   // Determine if more than 1 of the words in each of the low and high quadwords
06403   // of the result come from the same quadword of one of the two inputs.  Undef
06404   // mask values count as coming from any quadword, for better codegen.
06405   //
06406   // LoQuad[i] / HiQuad[i] count how many words of the low / high quadword of
06407   // the result come from input quad i; quads 0 and 1 refer to V1, 2 and 3 to V2.
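  // For example, the mask <0,1,2,3, 8,9,10,11> gives LoQuad = {4,0,0,0} and
  // HiQuad = {0,0,4,0}: the low half of the result is fed entirely by quad 0
  // (low half of V1) and the high half entirely by quad 2 (low half of V2).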
06408   unsigned LoQuad[] = { 0, 0, 0, 0 };
06409   unsigned HiQuad[] = { 0, 0, 0, 0 };
06410   // Indices of quads used.
06411   std::bitset<4> InputQuads;
06412   for (unsigned i = 0; i < 8; ++i) {
06413     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
06414     int EltIdx = SVOp->getMaskElt(i);
06415     MaskVals.push_back(EltIdx);
06416     if (EltIdx < 0) {
06417       ++Quad[0];
06418       ++Quad[1];
06419       ++Quad[2];
06420       ++Quad[3];
06421       continue;
06422     }
06423     ++Quad[EltIdx / 4];
06424     InputQuads.set(EltIdx / 4);
06425   }
06426 
06427   int BestLoQuad = -1;
06428   unsigned MaxQuad = 1;
06429   for (unsigned i = 0; i < 4; ++i) {
06430     if (LoQuad[i] > MaxQuad) {
06431       BestLoQuad = i;
06432       MaxQuad = LoQuad[i];
06433     }
06434   }
06435 
06436   int BestHiQuad = -1;
06437   MaxQuad = 1;
06438   for (unsigned i = 0; i < 4; ++i) {
06439     if (HiQuad[i] > MaxQuad) {
06440       BestHiQuad = i;
06441       MaxQuad = HiQuad[i];
06442     }
06443   }
06444 
06445   // For SSSE3, if all 8 words of the result come from only 1 quadword of each
06446   // of the two input vectors, shuffle them into one input vector so only a
06447   // single pshufb instruction is necessary. If there are more than 2 input
06448   // quads, disable the next transformation since it does not help SSSE3.
06449   bool V1Used = InputQuads[0] || InputQuads[1];
06450   bool V2Used = InputQuads[2] || InputQuads[3];
06451   if (Subtarget->hasSSSE3()) {
06452     if (InputQuads.count() == 2 && V1Used && V2Used) {
06453       BestLoQuad = InputQuads[0] ? 0 : 1;
06454       BestHiQuad = InputQuads[2] ? 2 : 3;
06455     }
06456     if (InputQuads.count() > 2) {
06457       BestLoQuad = -1;
06458       BestHiQuad = -1;
06459     }
06460   }
06461 
06462   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
06463   // the shuffle mask.  If a quad is scored as -1, that means that it contains
06464   // words from all 4 input quadwords.
06465   SDValue NewV;
06466   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
06467     int MaskV[] = {
06468       BestLoQuad < 0 ? 0 : BestLoQuad,
06469       BestHiQuad < 0 ? 1 : BestHiQuad
06470     };
06471     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
06472                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
06473                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
06474     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
06475 
06476     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
06477     // source words for the shuffle, to aid later transformations.
06478     bool AllWordsInNewV = true;
06479     bool InOrder[2] = { true, true };
06480     for (unsigned i = 0; i != 8; ++i) {
06481       int idx = MaskVals[i];
06482       if (idx != (int)i)
06483         InOrder[i/4] = false;
06484       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
06485         continue;
06486       AllWordsInNewV = false;
06487       break;
06488     }
06489 
06490     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
06491     if (AllWordsInNewV) {
06492       for (int i = 0; i != 8; ++i) {
06493         int idx = MaskVals[i];
06494         if (idx < 0)
06495           continue;
06496         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
06497         if ((idx != i) && idx < 4)
06498           pshufhw = false;
06499         if ((idx != i) && idx > 3)
06500           pshuflw = false;
06501       }
06502       V1 = NewV;
06503       V2Used = false;
06504       BestLoQuad = 0;
06505       BestHiQuad = 1;
06506     }
06507 
06508     // If we've eliminated the use of V2, and the new mask is a pshuflw or
06509     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
06510     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
06511       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
06512       unsigned TargetMask = 0;
06513       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
06514                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
06515       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
06516       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
06517                              getShufflePSHUFLWImmediate(SVOp);
06518       V1 = NewV.getOperand(0);
06519       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
06520     }
06521   }
06522 
06523   // Promote splats to a larger type which usually leads to more efficient code.
06524   // FIXME: Is this true if pshufb is available?
06525   if (SVOp->isSplat())
06526     return PromoteSplat(SVOp, DAG);
06527 
06528   // If we have SSSE3, and all words of the result are from 1 input vector,
06529   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
06530   // is present, fall back to case 4.
06531   if (Subtarget->hasSSSE3()) {
06532     SmallVector<SDValue,16> pshufbMask;
06533 
06534     // If we have elements from both input vectors, set the high bit of the
06535     // shuffle mask element to zero out elements that come from V2 in the V1
06536     // mask, and elements that come from V1 in the V2 mask, so that the two
06537     // results can be OR'd together.
06538     bool TwoInputs = V1Used && V2Used;
06539     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
06540     if (!TwoInputs)
06541       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
06542 
06543     // Calculate the shuffle mask for the second input, shuffle it, and
06544     // OR it with the first shuffled input.
06545     CommuteVectorShuffleMask(MaskVals, 8);
06546     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
06547     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
06548     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
06549   }
06550 
06551   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
06552   // and update MaskVals with new element order.
06553   std::bitset<8> InOrder;
06554   if (BestLoQuad >= 0) {
06555     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
06556     for (int i = 0; i != 4; ++i) {
06557       int idx = MaskVals[i];
06558       if (idx < 0) {
06559         InOrder.set(i);
06560       } else if ((idx / 4) == BestLoQuad) {
06561         MaskV[i] = idx & 3;
06562         InOrder.set(i);
06563       }
06564     }
06565     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
06566                                 &MaskV[0]);
06567 
06568     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
06569       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
06570       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
06571                                   NewV.getOperand(0),
06572                                   getShufflePSHUFLWImmediate(SVOp), DAG);
06573     }
06574   }
06575 
06576   // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
06577   // and update MaskVals with the new element order.
06578   if (BestHiQuad >= 0) {
06579     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
06580     for (unsigned i = 4; i != 8; ++i) {
06581       int idx = MaskVals[i];
06582       if (idx < 0) {
06583         InOrder.set(i);
06584       } else if ((idx / 4) == BestHiQuad) {
06585         MaskV[i] = (idx & 3) + 4;
06586         InOrder.set(i);
06587       }
06588     }
06589     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
06590                                 &MaskV[0]);
06591 
06592     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
06593       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
06594       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
06595                                   NewV.getOperand(0),
06596                                   getShufflePSHUFHWImmediate(SVOp), DAG);
06597     }
06598   }
06599 
06600   // In case BestHiQuad & BestLoQuad were both -1, meaning each quadword has a
06601   // word from each of the four input quadwords, calculate the InOrder bitvector
06602   // now before falling through to the insert/extract cleanup.
06601   // from each of the four input quadwords, calculate the InOrder bitvector now
06602   // before falling through to the insert/extract cleanup.
06603   if (BestLoQuad == -1 && BestHiQuad == -1) {
06604     NewV = V1;
06605     for (int i = 0; i != 8; ++i)
06606       if (MaskVals[i] < 0 || MaskVals[i] == i)
06607         InOrder.set(i);
06608   }
06609 
06610   // The other elements are put in the right place using pextrw and pinsrw.
06611   for (unsigned i = 0; i != 8; ++i) {
06612     if (InOrder[i])
06613       continue;
06614     int EltIdx = MaskVals[i];
06615     if (EltIdx < 0)
06616       continue;
06617     SDValue ExtOp = (EltIdx < 8) ?
06618       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
06619                   DAG.getIntPtrConstant(EltIdx)) :
06620       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
06621                   DAG.getIntPtrConstant(EltIdx - 8));
06622     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
06623                        DAG.getIntPtrConstant(i));
06624   }
06625   return NewV;
06626 }
06627 
06628 /// \brief v16i16 shuffles
06629 ///
06630 /// FIXME: We only support generation of a single pshufb currently.  We can
06631 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
06632 /// well (e.g. 2 x pshufb + 1 x por).
06633 static SDValue
06634 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
06635   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
06636   SDValue V1 = SVOp->getOperand(0);
06637   SDValue V2 = SVOp->getOperand(1);
06638   SDLoc dl(SVOp);
06639 
06640   if (V2.getOpcode() != ISD::UNDEF)
06641     return SDValue();
06642 
06643   SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
06644   return getPSHUFB(MaskVals, V1, dl, DAG);
06645 }
06646 
06647 // v16i8 shuffles - Prefer shuffles in the following order:
06648 // 1. [ssse3] 1 x pshufb
06649 // 2. [ssse3] 2 x pshufb + 1 x por
06650 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
06651 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
06652                                         const X86Subtarget* Subtarget,
06653                                         SelectionDAG &DAG) {
06654   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06655   SDValue V1 = SVOp->getOperand(0);
06656   SDValue V2 = SVOp->getOperand(1);
06657   SDLoc dl(SVOp);
06658   ArrayRef<int> MaskVals = SVOp->getMask();
06659 
06660   // Promote splats to a larger type which usually leads to more efficient code.
06661   // FIXME: Is this true if pshufb is available?
06662   if (SVOp->isSplat())
06663     return PromoteSplat(SVOp, DAG);
06664 
06665   // If we have SSSE3, case 1 is generated when all result bytes come from
06666   // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
06667   // present, fall back to case 3.
06668 
06669   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
06670   if (Subtarget->hasSSSE3()) {
06671     SmallVector<SDValue,16> pshufbMask;
06672 
06673     // If all result elements are from one input vector, then only translate
06674     // undef mask values to 0x80 (zero out result) in the pshufb mask.
06675     //
06676     // Otherwise, we have elements from both input vectors, and must zero out
06677     // elements that come from V2 in the first mask, and V1 in the second mask
06678     // so that we can OR them together.
06679     for (unsigned i = 0; i != 16; ++i) {
06680       int EltIdx = MaskVals[i];
06681       if (EltIdx < 0 || EltIdx >= 16)
06682         EltIdx = 0x80;
06683       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
06684     }
06685     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
06686                      DAG.getNode(ISD::BUILD_VECTOR, dl,
06687                                  MVT::v16i8, &pshufbMask[0], 16));
06688 
06689     // As PSHUFB will zero elements with negative indices, it's safe to ignore
06690     // the 2nd operand if it's undefined or zero.
06691     if (V2.getOpcode() == ISD::UNDEF ||
06692         ISD::isBuildVectorAllZeros(V2.getNode()))
06693       return V1;
06694 
06695     // Calculate the shuffle mask for the second input, shuffle it, and
06696     // OR it with the first shuffled input.
06697     pshufbMask.clear();
06698     for (unsigned i = 0; i != 16; ++i) {
06699       int EltIdx = MaskVals[i];
06700       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
06701       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
06702     }
06703     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
06704                      DAG.getNode(ISD::BUILD_VECTOR, dl,
06705                                  MVT::v16i8, &pshufbMask[0], 16));
06706     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
06707   }
06708 
06709   // No SSSE3 - Calculate the in-place words and then fix all out-of-place words
06710   // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
06711   // the 16 different words that comprise the two doublequadword input vectors.
06712   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
06713   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
06714   SDValue NewV = V1;
06715   for (int i = 0; i != 8; ++i) {
06716     int Elt0 = MaskVals[i*2];
06717     int Elt1 = MaskVals[i*2+1];
06718 
06719     // This word of the result is all undef, skip it.
06720     if (Elt0 < 0 && Elt1 < 0)
06721       continue;
06722 
06723     // This word of the result is already in the correct place, skip it.
06724     if ((Elt0 == i*2) && (Elt1 == i*2+1))
06725       continue;
06726 
06727     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
06728     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
06729     SDValue InsElt;
06730 
06731     // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
06732     // together with a single extract, extract the word and insert it.
06733     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
06734       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
06735                            DAG.getIntPtrConstant(Elt1 / 2));
06736       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
06737                         DAG.getIntPtrConstant(i));
06738       continue;
06739     }
06740 
06741     // If Elt1 is defined, extract it from the appropriate source.  If the
06742     // source byte is not also odd, shift the extracted word left 8 bits;
06743     // otherwise clear the bottom 8 bits if we need to do an or.
06744     if (Elt1 >= 0) {
06745       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
06746                            DAG.getIntPtrConstant(Elt1 / 2));
06747       if ((Elt1 & 1) == 0)
06748         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
06749                              DAG.getConstant(8,
06750                                   TLI.getShiftAmountTy(InsElt.getValueType())));
06751       else if (Elt0 >= 0)
06752         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
06753                              DAG.getConstant(0xFF00, MVT::i16));
06754     }
06755     // If Elt0 is defined, extract it from the appropriate source.  If the
06756     // source byte is not also even, shift the extracted word right 8 bits. If
06757     // Elt1 was also defined, OR the extracted values together before
06758     // inserting them in the result.
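    // For example, with Elt0 = 3 and Elt1 = 6: byte 3 is the high half of
    // source word 1, so that word is shifted right by 8; byte 6 is the low
    // half of source word 3, so that word is shifted left by 8; OR-ing the
    // two gives a word with byte 3 in its low half and byte 6 in its high half.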
06759     if (Elt0 >= 0) {
06760       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
06761                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
06762       if ((Elt0 & 1) != 0)
06763         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
06764                               DAG.getConstant(8,
06765                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
06766       else if (Elt1 >= 0)
06767         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
06768                              DAG.getConstant(0x00FF, MVT::i16));
06769       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
06770                          : InsElt0;
06771     }
06772     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
06773                        DAG.getIntPtrConstant(i));
06774   }
06775   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
06776 }
06777 
06778 // v32i8 shuffles - Translate to VPSHUFB if possible.
06779 static
06780 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
06781                                  const X86Subtarget *Subtarget,
06782                                  SelectionDAG &DAG) {
06783   MVT VT = SVOp->getSimpleValueType(0);
06784   SDValue V1 = SVOp->getOperand(0);
06785   SDValue V2 = SVOp->getOperand(1);
06786   SDLoc dl(SVOp);
06787   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
06788 
06789   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
06790   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
06791   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
06792 
06793   // VPSHUFB may be generated if
06794   // (1) one of the input vectors is undef or a zeroinitializer (the mask
06795   // value 0x80 puts 0 in the corresponding slot of the vector), and
06796   // (2) the mask indices do not cross a 128-bit lane.
06797   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
06798       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
06799     return SDValue();
06800 
06801   if (V1IsAllZero && !V2IsAllZero) {
06802     CommuteVectorShuffleMask(MaskVals, 32);
06803     V1 = V2;
06804   }
06805   return getPSHUFB(MaskVals, V1, dl, DAG);
06806 }
06807 
06808 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
06809 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
06810 /// done when every pair / quad of shuffle mask elements points to elements in
06811 /// the right sequence. e.g.
06812 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
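/// e.g. for v8i16 this becomes the v4i32 shuffle <1, 5, 0, 7> (Scale = 2).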
06813 static
06814 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
06815                                  SelectionDAG &DAG) {
06816   MVT VT = SVOp->getSimpleValueType(0);
06817   SDLoc dl(SVOp);
06818   unsigned NumElems = VT.getVectorNumElements();
06819   MVT NewVT;
06820   unsigned Scale;
06821   switch (VT.SimpleTy) {
06822   default: llvm_unreachable("Unexpected!");
06823   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
06824   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
06825   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
06826   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
06827   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
06828   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
06829   }
06830 
06831   SmallVector<int, 8> MaskVec;
06832   for (unsigned i = 0; i != NumElems; i += Scale) {
06833     int StartIdx = -1;
06834     for (unsigned j = 0; j != Scale; ++j) {
06835       int EltIdx = SVOp->getMaskElt(i+j);
06836       if (EltIdx < 0)
06837         continue;
06838       if (StartIdx < 0)
06839         StartIdx = (EltIdx / Scale);
06840       if (EltIdx != (int)(StartIdx*Scale + j))
06841         return SDValue();
06842     }
06843     MaskVec.push_back(StartIdx);
06844   }
06845 
06846   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
06847   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
06848   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
06849 }
06850 
06851 /// getVZextMovL - Return a zero-extending vector move low node.
06852 ///
06853 static SDValue getVZextMovL(MVT VT, MVT OpVT,
06854                             SDValue SrcOp, SelectionDAG &DAG,
06855                             const X86Subtarget *Subtarget, SDLoc dl) {
06856   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
06857     LoadSDNode *LD = NULL;
06858     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
06859       LD = dyn_cast<LoadSDNode>(SrcOp);
06860     if (!LD) {
06861       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
06862       // instead.
06863       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
06864       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
06865           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
06866           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
06867           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
06868         // PR2108
06869         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
06870         return DAG.getNode(ISD::BITCAST, dl, VT,
06871                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
06872                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
06873                                                    OpVT,
06874                                                    SrcOp.getOperand(0)
06875                                                           .getOperand(0))));
06876       }
06877     }
06878   }
06879 
06880   return DAG.getNode(ISD::BITCAST, dl, VT,
06881                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
06882                                  DAG.getNode(ISD::BITCAST, dl,
06883                                              OpVT, SrcOp)));
06884 }
06885 
06886 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
06887 /// which could not be matched by any known target specific shuffle
06888 static SDValue
06889 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
06890 
06891   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
06892   if (NewOp.getNode())
06893     return NewOp;
06894 
06895   MVT VT = SVOp->getSimpleValueType(0);
06896 
06897   unsigned NumElems = VT.getVectorNumElements();
06898   unsigned NumLaneElems = NumElems / 2;
06899 
06900   SDLoc dl(SVOp);
06901   MVT EltVT = VT.getVectorElementType();
06902   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
06903   SDValue Output[2];
06904 
06905   SmallVector<int, 16> Mask;
06906   for (unsigned l = 0; l < 2; ++l) {
06907     // Build a shuffle mask for the output, discovering on the fly which
06908     // input vectors to use as shuffle operands (recorded in InputUsed).
06909     // If building a suitable shuffle vector proves too hard, then bail
06910     // out with UseBuildVector set.
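    // For example, for a v8f32 shuffle mask <0,1,8,9, 4,5,12,13>, the first
    // lane uses inputs 0 (low half of V1) and 2 (low half of V2); after
    // remapping, the 128-bit shuffle mask for that lane becomes <0,1,4,5>.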
06911     bool UseBuildVector = false;
06912     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
06913     unsigned LaneStart = l * NumLaneElems;
06914     for (unsigned i = 0; i != NumLaneElems; ++i) {
06915       // The mask element.  This indexes into the input.
06916       int Idx = SVOp->getMaskElt(i+LaneStart);
06917       if (Idx < 0) {
06918         // the mask element does not index into any input vector.
06919         Mask.push_back(-1);
06920         continue;
06921       }
06922 
06923       // The input vector this mask element indexes into.
06924       int Input = Idx / NumLaneElems;
06925 
06926       // Turn the index into an offset from the start of the input vector.
06927       Idx -= Input * NumLaneElems;
06928 
06929       // Find or create a shuffle vector operand to hold this input.
06930       unsigned OpNo;
06931       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
06932         if (InputUsed[OpNo] == Input)
06933           // This input vector is already an operand.
06934           break;
06935         if (InputUsed[OpNo] < 0) {
06936           // Create a new operand for this input vector.
06937           InputUsed[OpNo] = Input;
06938           break;
06939         }
06940       }
06941 
06942       if (OpNo >= array_lengthof(InputUsed)) {
06943         // More than two input vectors used!  Give up on trying to create a
06944         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
06945         UseBuildVector = true;
06946         break;
06947       }
06948 
06949       // Add the mask index for the new shuffle vector.
06950       Mask.push_back(Idx + OpNo * NumLaneElems);
06951     }
06952 
06953     if (UseBuildVector) {
06954       SmallVector<SDValue, 16> SVOps;
06955       for (unsigned i = 0; i != NumLaneElems; ++i) {
06956         // The mask element.  This indexes into the input.
06957         int Idx = SVOp->getMaskElt(i+LaneStart);
06958         if (Idx < 0) {
06959           SVOps.push_back(DAG.getUNDEF(EltVT));
06960           continue;
06961         }
06962 
06963         // The input vector this mask element indexes into.
06964         int Input = Idx / NumElems;
06965 
06966         // Turn the index into an offset from the start of the input vector.
06967         Idx -= Input * NumElems;
06968 
06969         // Extract the vector element by hand.
06970         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
06971                                     SVOp->getOperand(Input),
06972                                     DAG.getIntPtrConstant(Idx)));
06973       }
06974 
06975       // Construct the output using a BUILD_VECTOR.
06976       Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
06977                               SVOps.size());
06978     } else if (InputUsed[0] < 0) {
06979       // No input vectors were used! The result is undefined.
06980       Output[l] = DAG.getUNDEF(NVT);
06981     } else {
06982       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
06983                                         (InputUsed[0] % 2) * NumLaneElems,
06984                                         DAG, dl);
06985       // If only one input was used, use an undefined vector for the other.
06986       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
06987         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
06988                             (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
06989       // At least one input vector was used. Create a new shuffle vector.
06990       Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
06991     }
06992 
06993     Mask.clear();
06994   }
06995 
06996   // Concatenate the results back into a single vector.
06997   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
06998 }
06999 
07000 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
07001 /// 4 elements, and match them with several different shuffle types.
07002 static SDValue
07003 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
07004   SDValue V1 = SVOp->getOperand(0);
07005   SDValue V2 = SVOp->getOperand(1);
07006   SDLoc dl(SVOp);
07007   MVT VT = SVOp->getSimpleValueType(0);
07008 
07009   assert(VT.is128BitVector() && "Unsupported vector size");
07010 
07011   std::pair<int, int> Locs[4];
07012   int Mask1[] = { -1, -1, -1, -1 };
07013   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
07014 
07015   unsigned NumHi = 0;
07016   unsigned NumLo = 0;
07017   for (unsigned i = 0; i != 4; ++i) {
07018     int Idx = PermMask[i];
07019     if (Idx < 0) {
07020       Locs[i] = std::make_pair(-1, -1);
07021     } else {
07022       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
07023       if (Idx < 4) {
07024         Locs[i] = std::make_pair(0, NumLo);
07025         Mask1[NumLo] = Idx;
07026         NumLo++;
07027       } else {
07028         Locs[i] = std::make_pair(1, NumHi);
07029         if (2+NumHi < 4)
07030           Mask1[2+NumHi] = Idx;
07031         NumHi++;
07032       }
07033     }
07034   }
07035 
07036   if (NumLo <= 2 && NumHi <= 2) {
07037     // No more than two elements come from either vector, so this can be
07038     // implemented with two shuffles. The first shuffle gathers the elements.
07039     // The second shuffle, which takes the first shuffle as both of its
07040     // vector operands, puts the elements into the right order.
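    // For example, for the mask <0, 4, 1, 5> the first shuffle uses
    // Mask1 = <0, 1, 4, 5>, producing T = <V1[0], V1[1], V2[0], V2[1]>, and
    // the second shuffle reorders T with Mask2 = <0, 2, 5, 7> to give
    // <V1[0], V2[0], V1[1], V2[1]>.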
07041     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
07042 
07043     int Mask2[] = { -1, -1, -1, -1 };
07044 
07045     for (unsigned i = 0; i != 4; ++i)
07046       if (Locs[i].first != -1) {
07047         unsigned Idx = (i < 2) ? 0 : 4;
07048         Idx += Locs[i].first * 2 + Locs[i].second;
07049         Mask2[i] = Idx;
07050       }
07051 
07052     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
07053   }
07054 
07055   if (NumLo == 3 || NumHi == 3) {
07056     // Otherwise, we must have three elements from one vector, call it X, and
07057     // one element from the other, call it Y.  First, use a shufps to build an
07058     // intermediate vector with the one element from Y and the element from X
07059     // that will be in the same half in the final destination (the indexes don't
07060     // matter). Then, use a shufps to build the final vector, taking the half
07061     // containing the element from Y from the intermediate, and the other half
07062     // from X.
07063     if (NumHi == 3) {
07064       // Normalize it so the 3 elements come from V1.
07065       CommuteVectorShuffleMask(PermMask, 4);
07066       std::swap(V1, V2);
07067     }
07068 
07069     // Find the element from V2.
07070     unsigned HiIndex;
07071     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
07072       int Val = PermMask[HiIndex];
07073       if (Val < 0)
07074         continue;
07075       if (Val >= 4)
07076         break;
07077     }
07078 
07079     Mask1[0] = PermMask[HiIndex];
07080     Mask1[1] = -1;
07081     Mask1[2] = PermMask[HiIndex^1];
07082     Mask1[3] = -1;
07083     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
07084 
07085     if (HiIndex >= 2) {
07086       Mask1[0] = PermMask[0];
07087       Mask1[1] = PermMask[1];
07088       Mask1[2] = HiIndex & 1 ? 6 : 4;
07089       Mask1[3] = HiIndex & 1 ? 4 : 6;
07090       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
07091     }
07092 
07093     Mask1[0] = HiIndex & 1 ? 2 : 0;
07094     Mask1[1] = HiIndex & 1 ? 0 : 2;
07095     Mask1[2] = PermMask[2];
07096     Mask1[3] = PermMask[3];
07097     if (Mask1[2] >= 0)
07098       Mask1[2] += 4;
07099     if (Mask1[3] >= 0)
07100       Mask1[3] += 4;
07101     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
07102   }
07103 
07104   // Break it into (shuffle shuffle_hi, shuffle_lo).
07105   int LoMask[] = { -1, -1, -1, -1 };
07106   int HiMask[] = { -1, -1, -1, -1 };
07107 
07108   int *MaskPtr = LoMask;
07109   unsigned MaskIdx = 0;
07110   unsigned LoIdx = 0;
07111   unsigned HiIdx = 2;
07112   for (unsigned i = 0; i != 4; ++i) {
07113     if (i == 2) {
07114       MaskPtr = HiMask;
07115       MaskIdx = 1;
07116       LoIdx = 0;
07117       HiIdx = 2;
07118     }
07119     int Idx = PermMask[i];
07120     if (Idx < 0) {
07121       Locs[i] = std::make_pair(-1, -1);
07122     } else if (Idx < 4) {
07123       Locs[i] = std::make_pair(MaskIdx, LoIdx);
07124       MaskPtr[LoIdx] = Idx;
07125       LoIdx++;
07126     } else {
07127       Locs[i] = std::make_pair(MaskIdx, HiIdx);
07128       MaskPtr[HiIdx] = Idx;
07129       HiIdx++;
07130     }
07131   }
07132 
07133   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
07134   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
07135   int MaskOps[] = { -1, -1, -1, -1 };
07136   for (unsigned i = 0; i != 4; ++i)
07137     if (Locs[i].first != -1)
07138       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
07139   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
07140 }
07141 
07142 static bool MayFoldVectorLoad(SDValue V) {
07143   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
07144     V = V.getOperand(0);
07145 
07146   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
07147     V = V.getOperand(0);
07148   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
07149       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
07150     // BUILD_VECTOR (load), undef
07151     V = V.getOperand(0);
07152 
07153   return MayFoldLoad(V);
07154 }
07155 
07156 static
07157 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
07158   MVT VT = Op.getSimpleValueType();
07159 
07160   // Canonicalize to v2f64.
07161   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
07162   return DAG.getNode(ISD::BITCAST, dl, VT,
07163                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
07164                                           V1, DAG));
07165 }
07166 
07167 static
07168 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
07169                         bool HasSSE2) {
07170   SDValue V1 = Op.getOperand(0);
07171   SDValue V2 = Op.getOperand(1);
07172   MVT VT = Op.getSimpleValueType();
07173 
07174   assert(VT != MVT::v2i64 && "unsupported shuffle type");
07175 
07176   if (HasSSE2 && VT == MVT::v2f64)
07177     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
07178 
07179   // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
07180   return DAG.getNode(ISD::BITCAST, dl, VT,
07181                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
07182                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
07183                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
07184 }
07185 
07186 static
07187 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
07188   SDValue V1 = Op.getOperand(0);
07189   SDValue V2 = Op.getOperand(1);
07190   MVT VT = Op.getSimpleValueType();
07191 
07192   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
07193          "unsupported shuffle type");
07194 
07195   if (V2.getOpcode() == ISD::UNDEF)
07196     V2 = V1;
07197 
07198   // v4i32 or v4f32
07199   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
07200 }
07201 
07202 static
07203 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
07204   SDValue V1 = Op.getOperand(0);
07205   SDValue V2 = Op.getOperand(1);
07206   MVT VT = Op.getSimpleValueType();
07207   unsigned NumElems = VT.getVectorNumElements();
07208 
07209   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
07210   // operand of these instructions is only memory, so check if there's a
07211   // potential load folding here, otherwise use SHUFPS or MOVSD to match the
07212   // same masks.
07213   bool CanFoldLoad = false;
07214 
07215   // Trivial case, when V2 comes from a load.
07216   if (MayFoldVectorLoad(V2))
07217     CanFoldLoad = true;
07218 
07219   // When V1 is a load, it can be folded later into a store in isel, example:
07220   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
07221   //    turns into:
07222   //  (MOVLPSmr addr:$src1, VR128:$src2)
07223   // So, recognize this potential and also use MOVLPS or MOVLPD
07224   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
07225     CanFoldLoad = true;
07226 
07227   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
07228   if (CanFoldLoad) {
07229     if (HasSSE2 && NumElems == 2)
07230       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
07231 
07232     if (NumElems == 4)
07233       // If we don't care about the second element, proceed to use movss.
07234       if (SVOp->getMaskElt(1) != -1)
07235         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
07236   }
07237 
07238   // movl and movlp will both match v2i64, but v2i64 is never matched by
07239   // movl earlier because we make it strict to avoid messing with the movlp load
07240   // folding logic (see the code above getMOVLP call). Match it here then,
07241   // this is horrible, but will stay like this until we move all shuffle
07242   // matching to x86 specific nodes. Note that for the 1st condition all
07243   // types are matched with movsd.
07244   if (HasSSE2) {
07245     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
07246     // as to remove this logic from here, as much as possible
07247     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
07248       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
07249     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
07250   }
07251 
07252   assert(VT != MVT::v4i32 && "unsupported shuffle type");
07253 
07254   // Invert the operand order and use SHUFPS to match it.
07255   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
07256                               getShuffleSHUFImmediate(SVOp), DAG);
07257 }
07258 
07259 // Reduce a vector shuffle to zext.
07260 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
07261                                     SelectionDAG &DAG) {
07262   // PMOVZX is only available from SSE41.
07263   if (!Subtarget->hasSSE41())
07264     return SDValue();
07265 
07266   MVT VT = Op.getSimpleValueType();
07267 
07268   // Only AVX2 supports 256-bit vector integer extension.
07269   if (!Subtarget->hasInt256() && VT.is256BitVector())
07270     return SDValue();
07271 
07272   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
07273   SDLoc DL(Op);
07274   SDValue V1 = Op.getOperand(0);
07275   SDValue V2 = Op.getOperand(1);
07276   unsigned NumElems = VT.getVectorNumElements();
07277 
07278   // Extending is a unary operation, and the element type of the source vector
07279   // must not be i64 or wider (there is nothing to extend it to).
07280   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
07281       VT.getVectorElementType() == MVT::i64)
07282     return SDValue();
07283 
07284   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
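  // For example, the v8i16 mask <0, -1, 1, -1, 2, -1, 3, -1> has Shift == 1
  // (ratio 2), and the shuffle can then be treated as a zero-extension of the
  // low four i16 elements to v4i32.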
07285   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
07286   while ((1U << Shift) < NumElems) {
07287     if (SVOp->getMaskElt(1U << Shift) == 1)
07288       break;
07289     Shift += 1;
07290     // The maximal ratio is 8, i.e. from i8 to i64.
07291     if (Shift > 3)
07292       return SDValue();
07293   }
07294 
07295   // Check the shuffle mask.
07296   unsigned Mask = (1U << Shift) - 1;
07297   for (unsigned i = 0; i != NumElems; ++i) {
07298     int EltIdx = SVOp->getMaskElt(i);
07299     if ((i & Mask) != 0 && EltIdx != -1)
07300       return SDValue();
07301     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
07302       return SDValue();
07303   }
07304 
07305   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
07306   MVT NeVT = MVT::getIntegerVT(NBits);
07307   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
07308 
07309   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
07310     return SDValue();
07311 
07312   // Simplify the operand as it's prepared to be fed into shuffle.
07313   unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
07314   if (V1.getOpcode() == ISD::BITCAST &&
07315       V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
07316       V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
07317       V1.getOperand(0).getOperand(0)
07318         .getSimpleValueType().getSizeInBits() == SignificantBits) {
07319     // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
07320     SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
07321     ConstantSDNode *CIdx =
07322       dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
07323     // If it's foldable, i.e. a normal load with a single use, we let code
07324     // selection fold it. Otherwise, we shorten the conversion sequence.
07325     if (CIdx && CIdx->getZExtValue() == 0 &&
07326         (!ISD::isNormalLoad(V.