LLVM API Documentation

X86ISelLowering.cpp
00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86InstrBuilder.h"
00019 #include "X86MachineFunctionInfo.h"
00020 #include "X86TargetMachine.h"
00021 #include "X86TargetObjectFile.h"
00022 #include "llvm/ADT/SmallSet.h"
00023 #include "llvm/ADT/Statistic.h"
00024 #include "llvm/ADT/StringExtras.h"
00025 #include "llvm/ADT/VariadicFunction.h"
00026 #include "llvm/CodeGen/IntrinsicLowering.h"
00027 #include "llvm/CodeGen/MachineFrameInfo.h"
00028 #include "llvm/CodeGen/MachineFunction.h"
00029 #include "llvm/CodeGen/MachineInstrBuilder.h"
00030 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00031 #include "llvm/CodeGen/MachineModuleInfo.h"
00032 #include "llvm/CodeGen/MachineRegisterInfo.h"
00033 #include "llvm/IR/CallSite.h"
00034 #include "llvm/IR/CallingConv.h"
00035 #include "llvm/IR/Constants.h"
00036 #include "llvm/IR/DerivedTypes.h"
00037 #include "llvm/IR/Function.h"
00038 #include "llvm/IR/GlobalAlias.h"
00039 #include "llvm/IR/GlobalVariable.h"
00040 #include "llvm/IR/Instructions.h"
00041 #include "llvm/IR/Intrinsics.h"
00042 #include "llvm/MC/MCAsmInfo.h"
00043 #include "llvm/MC/MCContext.h"
00044 #include "llvm/MC/MCExpr.h"
00045 #include "llvm/MC/MCSymbol.h"
00046 #include "llvm/Support/Debug.h"
00047 #include "llvm/Support/ErrorHandling.h"
00048 #include "llvm/Support/MathExtras.h"
00049 #include "llvm/Target/TargetOptions.h"
00050 #include <bitset>
00051 #include <cctype>
00052 using namespace llvm;
00053 
00054 #define DEBUG_TYPE "x86-isel"
00055 
00056 STATISTIC(NumTailCalls, "Number of tail calls");
00057 
00058 // Forward declarations.
00059 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00060                        SDValue V2);
00061 
00062 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
00063                                 SelectionDAG &DAG, SDLoc dl,
00064                                 unsigned vectorWidth) {
00065   assert((vectorWidth == 128 || vectorWidth == 256) &&
00066          "Unsupported vector width");
00067   EVT VT = Vec.getValueType();
00068   EVT ElVT = VT.getVectorElementType();
00069   unsigned Factor = VT.getSizeInBits()/vectorWidth;
00070   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
00071                                   VT.getVectorNumElements()/Factor);
00072 
00073   // Extract from UNDEF is UNDEF.
00074   if (Vec.getOpcode() == ISD::UNDEF)
00075     return DAG.getUNDEF(ResultVT);
00076 
00077   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
00078   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
00079 
00080   // This is the index of the first element of the vectorWidth-bit chunk
00081   // we want.
00082   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
00083                                * ElemsPerChunk);
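        // (Editorial note, illustrative values only: for a 256-bit v8i32 source
        // with IdxVal = 6 and vectorWidth = 128, ElemsPerChunk is 4 and
        // NormalizedIdxVal is (6*32/128)*4 = 4, i.e. the extract is rounded down
        // to start at the second 128-bit chunk.)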
00084 
00085   // If the input is a buildvector just emit a smaller one.
00086   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
00087     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
00088                        Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
00089 
00090   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00091   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
00092                                VecIdx);
00093 
00094   return Result;
00095 
00096 }
00097 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
00098 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
00099 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
00100 /// instructions or a simple subregister reference. Idx is an index in the
00101 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
00102 /// lowering EXTRACT_VECTOR_ELT operations easier.
00103 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
00104                                    SelectionDAG &DAG, SDLoc dl) {
00105   assert((Vec.getValueType().is256BitVector() ||
00106           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
00107   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
00108 }
00109 
00110 /// Generate a DAG to grab 256-bits from a 512-bit vector.
00111 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
00112                                    SelectionDAG &DAG, SDLoc dl) {
00113   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
00114   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
00115 }
00116 
00117 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
00118                                unsigned IdxVal, SelectionDAG &DAG,
00119                                SDLoc dl, unsigned vectorWidth) {
00120   assert((vectorWidth == 128 || vectorWidth == 256) &&
00121          "Unsupported vector width");
00122   // Inserting UNDEF leaves Result unchanged.
00123   if (Vec.getOpcode() == ISD::UNDEF)
00124     return Result;
00125   EVT VT = Vec.getValueType();
00126   EVT ElVT = VT.getVectorElementType();
00127   EVT ResultVT = Result.getValueType();
00128 
00129   // Insert the relevant vectorWidth bits.
00130   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
00131 
00132   // This is the index of the first element of the vectorWidth-bit chunk
00133   // we want.
00134   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
00135                                * ElemsPerChunk);
00136 
00137   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00138   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
00139                      VecIdx);
00140 }
00141 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
00142 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
00143 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
00144 /// simple superregister reference.  Idx is an index in the 128 bits
00145 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
00146 /// lowering INSERT_VECTOR_ELT operations easier.
00147 static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
00148                                   unsigned IdxVal, SelectionDAG &DAG,
00149                                   SDLoc dl) {
00150   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
00151   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
00152 }
00153 
00154 static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
00155                                   unsigned IdxVal, SelectionDAG &DAG,
00156                                   SDLoc dl) {
00157   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
00158   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
00159 }
00160 
00161 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
00162 /// instructions. This is used because creating CONCAT_VECTORS nodes of
00163 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
00164 /// large BUILD_VECTORS.
00165 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
00166                                    unsigned NumElems, SelectionDAG &DAG,
00167                                    SDLoc dl) {
00168   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00169   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
00170 }
00171 
00172 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
00173                                    unsigned NumElems, SelectionDAG &DAG,
00174                                    SDLoc dl) {
00175   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00176   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
00177 }
00178 
00179 static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
00180   const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
00181   bool is64Bit = Subtarget->is64Bit();
00182 
00183   if (Subtarget->isTargetMacho()) {
00184     if (is64Bit)
00185       return new X86_64MachoTargetObjectFile();
00186     return new TargetLoweringObjectFileMachO();
00187   }
00188 
00189   if (Subtarget->isTargetLinux())
00190     return new X86LinuxTargetObjectFile();
00191   if (Subtarget->isTargetELF())
00192     return new TargetLoweringObjectFileELF();
00193   if (Subtarget->isTargetKnownWindowsMSVC())
00194     return new X86WindowsTargetObjectFile();
00195   if (Subtarget->isTargetCOFF())
00196     return new TargetLoweringObjectFileCOFF();
00197   llvm_unreachable("unknown subtarget type");
00198 }
00199 
00200 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
00201   : TargetLowering(TM, createTLOF(TM)) {
00202   Subtarget = &TM.getSubtarget<X86Subtarget>();
00203   X86ScalarSSEf64 = Subtarget->hasSSE2();
00204   X86ScalarSSEf32 = Subtarget->hasSSE1();
00205   TD = getDataLayout();
00206 
00207   resetOperationActions();
00208 }
00209 
00210 void X86TargetLowering::resetOperationActions() {
00211   const TargetMachine &TM = getTargetMachine();
00212   static bool FirstTimeThrough = true;
00213 
00214   // If none of the target options have changed, then we don't need to reset the
00215   // operation actions.
00216   if (!FirstTimeThrough && TO == TM.Options) return;
00217 
00218   if (!FirstTimeThrough) {
00219     // Reinitialize the actions.
00220     initActions();
00221     FirstTimeThrough = false;
00222   }
00223 
00224   TO = TM.Options;
00225 
00226   // Set up the TargetLowering object.
00227   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00228 
00229   // X86 is weird; it always uses i8 for shift amounts and setcc results.
00230   setBooleanContents(ZeroOrOneBooleanContent);
00231   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00232   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
00233 
00234   // For 64-bit, since we have so many registers, use the ILP scheduler; for
00235   // 32-bit code, use the register-pressure-specific scheduling.
00236   // For Atom, always use ILP scheduling.
00237   if (Subtarget->isAtom())
00238     setSchedulingPreference(Sched::ILP);
00239   else if (Subtarget->is64Bit())
00240     setSchedulingPreference(Sched::ILP);
00241   else
00242     setSchedulingPreference(Sched::RegPressure);
00243   const X86RegisterInfo *RegInfo =
00244     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
00245   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00246 
00247   // Bypass expensive divides on Atom when compiling with O2
00248   if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
00249     addBypassSlowDiv(32, 8);
00250     if (Subtarget->is64Bit())
00251       addBypassSlowDiv(64, 16);
00252   }
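        // (Editorial note, sketching what addBypassSlowDiv requests: the
        // slow-division bypass run from CodeGenPrepare emits a runtime check and
        // uses an 8-bit divide when both operands of a 32-bit divide happen to
        // fit in 8 bits; likewise 64 -> 16 on 64-bit targets.)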
00253 
00254   if (Subtarget->isTargetKnownWindowsMSVC()) {
00255     // Setup Windows compiler runtime calls.
00256     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00257     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00258     setLibcallName(RTLIB::SREM_I64, "_allrem");
00259     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00260     setLibcallName(RTLIB::MUL_I64, "_allmul");
00261     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00262     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00263     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00264     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00265     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00266 
00267     // The _ftol2 runtime function has an unusual calling conv, which
00268     // is modeled by a special pseudo-instruction.
00269     setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
00270     setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
00271     setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
00272     setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
00273   }
00274 
00275   if (Subtarget->isTargetDarwin()) {
00276     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00277     setUseUnderscoreSetJmp(false);
00278     setUseUnderscoreLongJmp(false);
00279   } else if (Subtarget->isTargetWindowsGNU()) {
00280     // MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore)!
00281     setUseUnderscoreSetJmp(true);
00282     setUseUnderscoreLongJmp(false);
00283   } else {
00284     setUseUnderscoreSetJmp(true);
00285     setUseUnderscoreLongJmp(true);
00286   }
00287 
00288   // Set up the register classes.
00289   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00290   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00291   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00292   if (Subtarget->is64Bit())
00293     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00294 
00295   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
00296 
00297   // We don't accept any truncstore of integer registers.
00298   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00299   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00300   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00301   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00302   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00303   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00304 
00305   // SETOEQ and SETUNE require checking two conditions.
00306   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00307   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00308   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00309   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00310   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00311   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00312 
00313   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00314   // operation.
00315   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00316   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00317   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
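        // (Editorial note: the promotion is safe because an i8/i16 value
        // zero-extended to i32 is always non-negative, so a signed i32 -> FP
        // conversion produces the same result as the unsigned one.)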
00318 
00319   if (Subtarget->is64Bit()) {
00320     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00321     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00322   } else if (!TM.Options.UseSoftFloat) {
00323     // We have an algorithm for SSE2->double, and we turn this into a
00324     // 64-bit FILD followed by conditional FADD for other targets.
00325     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00326     // We have an algorithm for SSE2, and we turn this into a 64-bit
00327     // FILD for other targets.
00328     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00329   }
00330 
00331   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00332   // this operation.
00333   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00334   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00335 
00336   if (!TM.Options.UseSoftFloat) {
00337     // SSE has no i16 to fp conversion, only i32
00338     if (X86ScalarSSEf32) {
00339       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00340       // f32 and f64 cases are Legal, f80 case is not
00341       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00342     } else {
00343       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00344       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00345     }
00346   } else {
00347     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00348     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00349   }
00350 
00351   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00352   // are Legal, f80 is custom lowered.
00353   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00354   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00355 
00356   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
00357   // this operation.
00358   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00359   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00360 
00361   if (X86ScalarSSEf32) {
00362     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00363     // f32 and f64 cases are Legal, f80 case is not
00364     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00365   } else {
00366     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00367     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00368   }
00369 
00370   // Handle FP_TO_UINT by promoting the destination to a larger signed
00371   // conversion.
00372   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00373   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00374   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
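        // (Editorial note, for illustration: an f32 -> i16 FP_TO_UINT is handled
        // as an f32 -> i32 FP_TO_SINT followed by a truncate; the unsigned i16
        // range [0, 65535] fits inside a signed i32, so the signed conversion
        // yields the correct bits.)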
00375 
00376   if (Subtarget->is64Bit()) {
00377     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00378     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00379   } else if (!TM.Options.UseSoftFloat) {
00380     // Since AVX is a superset of SSE3, only check for SSE here.
00381     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00382       // Expand FP_TO_UINT into a select.
00383       // FIXME: We would like to use a Custom expander here eventually to do
00384       // the optimal thing for SSE vs. the default expansion in the legalizer.
00385       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00386     else
00387       // With SSE3 we can use fisttpll to convert to a signed i64; without
00388       // SSE, we're stuck with a fistpll.
00389       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00390   }
00391 
00392   if (isTargetFTOL()) {
00393     // Use the _ftol2 runtime function, which has a pseudo-instruction
00394     // to handle its weird calling convention.
00395     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00396   }
00397 
00398   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00399   if (!X86ScalarSSEf64) {
00400     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00401     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00402     if (Subtarget->is64Bit()) {
00403       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00404       // Without SSE, i64->f64 goes through memory.
00405       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00406     }
00407   }
00408 
00409   // Scalar integer divide and remainder are lowered to use operations that
00410   // produce two results, to match the available instructions. This exposes
00411   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00412   // into a single instruction.
00413   //
00414   // Scalar integer multiply-high is also lowered to use two-result
00415   // operations, to match the available instructions. However, plain multiply
00416   // (low) operations are left as Legal, as there are single-result
00417   // instructions for this in x86. Using the two-result multiply instructions
00418   // when both high and low results are needed must be arranged by dagcombine.
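        // (Editorial note: on x86 a single DIV/IDIV already produces both results,
        // e.g. a 32-bit divide leaves the quotient in EAX and the remainder in EDX,
        // so "x / y" and "x % y" can share one instruction once they are combined
        // into a single divrem node.)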
00419   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00420     MVT VT = IntVTs[i];
00421     setOperationAction(ISD::MULHS, VT, Expand);
00422     setOperationAction(ISD::MULHU, VT, Expand);
00423     setOperationAction(ISD::SDIV, VT, Expand);
00424     setOperationAction(ISD::UDIV, VT, Expand);
00425     setOperationAction(ISD::SREM, VT, Expand);
00426     setOperationAction(ISD::UREM, VT, Expand);
00427 
00428     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
00429     setOperationAction(ISD::ADDC, VT, Custom);
00430     setOperationAction(ISD::ADDE, VT, Custom);
00431     setOperationAction(ISD::SUBC, VT, Custom);
00432     setOperationAction(ISD::SUBE, VT, Custom);
00433   }
00434 
00435   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00436   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00437   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00438   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00439   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00440   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00441   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00442   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00443   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00444   setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
00445   if (Subtarget->is64Bit())
00446     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00447   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00448   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00449   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00450   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00451   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00452   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00453   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00454   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00455 
00456   // Promote the i8 variants and force them up to i32, which has a shorter
00457   // encoding.
00458   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00459   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00460   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00461   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
00462   if (Subtarget->hasBMI()) {
00463     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00464     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00465     if (Subtarget->is64Bit())
00466       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00467   } else {
00468     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00469     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00470     if (Subtarget->is64Bit())
00471       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00472   }
00473 
00474   if (Subtarget->hasLZCNT()) {
00475     // When promoting the i8 variants, force them to i32 for a shorter
00476     // encoding.
00477     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00478     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00479     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00480     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00481     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00482     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00483     if (Subtarget->is64Bit())
00484       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00485   } else {
00486     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00487     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00488     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00489     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00490     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00491     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00492     if (Subtarget->is64Bit()) {
00493       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00494       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00495     }
00496   }
00497 
00498   if (Subtarget->hasPOPCNT()) {
00499     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00500   } else {
00501     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00502     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00503     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00504     if (Subtarget->is64Bit())
00505       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00506   }
00507 
00508   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00509 
00510   if (!Subtarget->hasMOVBE())
00511     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00512 
00513   // These should be promoted to a larger select which is supported.
00514   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00515   // X86 wants to expand cmov itself.
00516   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00517   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00518   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00519   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00520   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00521   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00522   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00523   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00524   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00525   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00526   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00527   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00528   if (Subtarget->is64Bit()) {
00529     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00530     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00531   }
00532   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00533   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
00534   // handling; they are a lightweight setjmp/longjmp replacement used to support
00535   // continuations, user-level threading, etc. As a result, no other SjLj
00536   // exception interfaces are implemented, so please don't build your own
00537   // exception handling based on them.
00538   // LLVM/Clang supports zero-cost DWARF exception handling.
00539   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00540   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00541 
00542   // Darwin ABI issue.
00543   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00544   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00545   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00546   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00547   if (Subtarget->is64Bit())
00548     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00549   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00550   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00551   if (Subtarget->is64Bit()) {
00552     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00553     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00554     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00555     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00556     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00557   }
00558   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00559   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00560   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00561   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00562   if (Subtarget->is64Bit()) {
00563     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00564     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00565     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00566   }
00567 
00568   if (Subtarget->hasSSE1())
00569     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00570 
00571   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00572 
00573   // Expand certain atomics
00574   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00575     MVT VT = IntVTs[i];
00576     setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
00577     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00578     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00579   }
00580 
00581   if (!Subtarget->is64Bit()) {
00582     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
00583     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
00584     setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
00585     setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
00586     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
00587     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
00588     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
00589     setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
00590     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
00591     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
00592     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
00593     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
00594   }
00595 
00596   if (Subtarget->hasCmpxchg16b()) {
00597     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
00598   }
00599 
00600   // FIXME - use subtarget debug flags
00601   if (!Subtarget->isTargetDarwin() &&
00602       !Subtarget->isTargetELF() &&
00603       !Subtarget->isTargetCygMing()) {
00604     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00605   }
00606 
00607   if (Subtarget->is64Bit()) {
00608     setExceptionPointerRegister(X86::RAX);
00609     setExceptionSelectorRegister(X86::RDX);
00610   } else {
00611     setExceptionPointerRegister(X86::EAX);
00612     setExceptionSelectorRegister(X86::EDX);
00613   }
00614   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00615   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00616 
00617   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00618   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00619 
00620   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00621   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00622 
00623   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00624   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00625   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00626   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00627     // TargetInfo::X86_64ABIBuiltinVaList
00628     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00629     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00630   } else {
00631     // TargetInfo::CharPtrBuiltinVaList
00632     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00633     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00634   }
00635 
00636   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00637   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00638 
00639   setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
00640                      MVT::i64 : MVT::i32, Custom);
00641 
00642   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00643     // f32 and f64 use SSE.
00644     // Set up the FP register classes.
00645     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00646     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00647 
00648     // Use ANDPD to simulate FABS.
00649     setOperationAction(ISD::FABS , MVT::f64, Custom);
00650     setOperationAction(ISD::FABS , MVT::f32, Custom);
00651 
00652     // Use XORP to simulate FNEG.
00653     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00654     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00655 
00656     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00657     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00658     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00659 
00660     // Lower this to FGETSIGNx86 plus an AND.
00661     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00662     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00663 
00664     // We don't support sin/cos/fmod
00665     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00666     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00667     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00668     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00669     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00670     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00671 
00672     // Expand FP immediates into loads from the stack, except for the special
00673     // cases we handle.
00674     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00675     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00676   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00677     // Use SSE for f32, x87 for f64.
00678     // Set up the FP register classes.
00679     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00680     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00681 
00682     // Use ANDPS to simulate FABS.
00683     setOperationAction(ISD::FABS , MVT::f32, Custom);
00684 
00685     // Use XORP to simulate FNEG.
00686     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00687 
00688     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00689 
00690     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00691     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00692     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00693 
00694     // We don't support sin/cos/fmod
00695     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00696     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00697     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00698 
00699     // Special cases we handle for FP constants.
00700     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00701     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00702     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00703     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00704     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00705 
00706     if (!TM.Options.UnsafeFPMath) {
00707       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00708       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00709       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00710     }
00711   } else if (!TM.Options.UseSoftFloat) {
00712     // f32 and f64 in x87.
00713     // Set up the FP register classes.
00714     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00715     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00716 
00717     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00718     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00719     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00720     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00721 
00722     if (!TM.Options.UnsafeFPMath) {
00723       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00724       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00725       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00726       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00727       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00728       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00729     }
00730     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00731     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00732     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00733     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00734     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00735     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00736     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00737     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00738   }
00739 
00740   // We don't support FMA.
00741   setOperationAction(ISD::FMA, MVT::f64, Expand);
00742   setOperationAction(ISD::FMA, MVT::f32, Expand);
00743 
00744   // Long double always uses X87.
00745   if (!TM.Options.UseSoftFloat) {
00746     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00747     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00748     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00749     {
00750       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00751       addLegalFPImmediate(TmpFlt);  // FLD0
00752       TmpFlt.changeSign();
00753       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00754 
00755       bool ignored;
00756       APFloat TmpFlt2(+1.0);
00757       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00758                       &ignored);
00759       addLegalFPImmediate(TmpFlt2);  // FLD1
00760       TmpFlt2.changeSign();
00761       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00762     }
00763 
00764     if (!TM.Options.UnsafeFPMath) {
00765       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00766       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00767       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00768     }
00769 
00770     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00771     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00772     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00773     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00774     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00775     setOperationAction(ISD::FMA, MVT::f80, Expand);
00776   }
00777 
00778   // Always use a library call for pow.
00779   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00780   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00781   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00782 
00783   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00784   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00785   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00786   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00787   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00788 
00789   // First set operation action for all vector types to either promote
00790   // (for widening) or expand (for scalarization). Then we will selectively
00791   // turn on ones that can be effectively codegen'd.
00792   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
00793            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
00794     MVT VT = (MVT::SimpleValueType)i;
00795     setOperationAction(ISD::ADD , VT, Expand);
00796     setOperationAction(ISD::SUB , VT, Expand);
00797     setOperationAction(ISD::FADD, VT, Expand);
00798     setOperationAction(ISD::FNEG, VT, Expand);
00799     setOperationAction(ISD::FSUB, VT, Expand);
00800     setOperationAction(ISD::MUL , VT, Expand);
00801     setOperationAction(ISD::FMUL, VT, Expand);
00802     setOperationAction(ISD::SDIV, VT, Expand);
00803     setOperationAction(ISD::UDIV, VT, Expand);
00804     setOperationAction(ISD::FDIV, VT, Expand);
00805     setOperationAction(ISD::SREM, VT, Expand);
00806     setOperationAction(ISD::UREM, VT, Expand);
00807     setOperationAction(ISD::LOAD, VT, Expand);
00808     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00809     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00810     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00811     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00812     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00813     setOperationAction(ISD::FABS, VT, Expand);
00814     setOperationAction(ISD::FSIN, VT, Expand);
00815     setOperationAction(ISD::FSINCOS, VT, Expand);
00816     setOperationAction(ISD::FCOS, VT, Expand);
00817     setOperationAction(ISD::FSINCOS, VT, Expand);
00818     setOperationAction(ISD::FREM, VT, Expand);
00819     setOperationAction(ISD::FMA,  VT, Expand);
00820     setOperationAction(ISD::FPOWI, VT, Expand);
00821     setOperationAction(ISD::FSQRT, VT, Expand);
00822     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00823     setOperationAction(ISD::FFLOOR, VT, Expand);
00824     setOperationAction(ISD::FCEIL, VT, Expand);
00825     setOperationAction(ISD::FTRUNC, VT, Expand);
00826     setOperationAction(ISD::FRINT, VT, Expand);
00827     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00828     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00829     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00830     setOperationAction(ISD::SDIVREM, VT, Expand);
00831     setOperationAction(ISD::UDIVREM, VT, Expand);
00832     setOperationAction(ISD::FPOW, VT, Expand);
00833     setOperationAction(ISD::CTPOP, VT, Expand);
00834     setOperationAction(ISD::CTTZ, VT, Expand);
00835     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00836     setOperationAction(ISD::CTLZ, VT, Expand);
00837     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00838     setOperationAction(ISD::SHL, VT, Expand);
00839     setOperationAction(ISD::SRA, VT, Expand);
00840     setOperationAction(ISD::SRL, VT, Expand);
00841     setOperationAction(ISD::ROTL, VT, Expand);
00842     setOperationAction(ISD::ROTR, VT, Expand);
00843     setOperationAction(ISD::BSWAP, VT, Expand);
00844     setOperationAction(ISD::SETCC, VT, Expand);
00845     setOperationAction(ISD::FLOG, VT, Expand);
00846     setOperationAction(ISD::FLOG2, VT, Expand);
00847     setOperationAction(ISD::FLOG10, VT, Expand);
00848     setOperationAction(ISD::FEXP, VT, Expand);
00849     setOperationAction(ISD::FEXP2, VT, Expand);
00850     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00851     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00852     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00853     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00854     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00855     setOperationAction(ISD::TRUNCATE, VT, Expand);
00856     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00857     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00858     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00859     setOperationAction(ISD::VSELECT, VT, Expand);
00860     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
00861              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
00862       setTruncStoreAction(VT,
00863                           (MVT::SimpleValueType)InnerVT, Expand);
00864     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
00865     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
00866     setLoadExtAction(ISD::EXTLOAD, VT, Expand);
00867   }
00868 
00869   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00870   // with -msoft-float, disable use of MMX as well.
00871   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00872     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00873     // No operations on x86mmx are supported; everything uses intrinsics.
00874   }
00875 
00876   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00877   // into smaller operations.
00878   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
00879   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
00880   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
00881   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
00882   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
00883   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
00884   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
00885   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
00886   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
00887   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
00888   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
00889   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
00890   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
00891   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
00892   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
00893   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
00894   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
00895   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
00896   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
00897   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
00898   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00899   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
00900   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
00901   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
00902   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
00903   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
00904   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
00905   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
00906   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
00907 
00908   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00909     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00910 
00911     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00912     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00913     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00914     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00915     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00916     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00917     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00918     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00919     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00920     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00921     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00922     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00923   }
00924 
00925   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00926     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00927 
00928     // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
00929     // registers cannot be used even for integer operations.
00930     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00931     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00932     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00933     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00934 
00935     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00936     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00937     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00938     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00939     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00940     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00941     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00942     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00943     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00944     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00945     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00946     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00947     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00948     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00949     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00950     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00951     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00952     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00953 
00954     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00955     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00956     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00957     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00958 
00959     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00960     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00961     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00962     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00963     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00964 
00965     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
00966     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00967       MVT VT = (MVT::SimpleValueType)i;
00968       // Do not attempt to custom lower non-power-of-2 vectors
00969       if (!isPowerOf2_32(VT.getVectorNumElements()))
00970         continue;
00971       // Do not attempt to custom lower non-128-bit vectors
00972       if (!VT.is128BitVector())
00973         continue;
00974       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
00975       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
00976       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
00977     }
00978 
00979     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
00980     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
00981     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
00982     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
00983     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
00984     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
00985 
00986     if (Subtarget->is64Bit()) {
00987       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
00988       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
00989     }
00990 
00991     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
00992     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00993       MVT VT = (MVT::SimpleValueType)i;
00994 
00995       // Do not attempt to promote non-128-bit vectors
00996       if (!VT.is128BitVector())
00997         continue;
00998 
00999       setOperationAction(ISD::AND,    VT, Promote);
01000       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
01001       setOperationAction(ISD::OR,     VT, Promote);
01002       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
01003       setOperationAction(ISD::XOR,    VT, Promote);
01004       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
01005       setOperationAction(ISD::LOAD,   VT, Promote);
01006       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
01007       setOperationAction(ISD::SELECT, VT, Promote);
01008       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
01009     }
01010 
01011     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
01012 
01013     // Custom lower v2i64 and v2f64 selects.
01014     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
01015     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
01016     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
01017     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
01018 
01019     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
01020     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
01021 
01022     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
01023     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
01024     // As there is no 64-bit GPR available, we need to build a special custom
01025     // sequence to convert from v2i32 to v2f32.
01026     if (!Subtarget->is64Bit())
01027       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
01028 
01029     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
01030     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
01031 
01032     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
01033   }
01034 
01035   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
01036     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
01037     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
01038     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
01039     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
01040     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
01041     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
01042     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
01043     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
01044     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
01045     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
01046 
01047     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
01048     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
01049     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
01050     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
01051     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
01052     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
01053     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
01054     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
01055     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
01056     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
01057 
01058     // FIXME: Do we need to handle scalar-to-vector here?
01059     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
01060 
01061     setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
01062     setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
01063     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
01064     setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
01065     setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
01066 
01067     // i8 and i16 vectors are custom, because the source register and source
01068     // memory operand types are not the same width.  f32 vectors are custom
01069     // since the immediate controlling the insert encodes additional
01070     // information.
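          // (Editorial note: e.g. PINSRB/PINSRW read a full GR32 register but only
          // an 8-/16-bit memory operand, and the INSERTPS immediate also selects
          // source/destination lanes and a zero mask, which is why these inserts
          // are custom-lowered rather than matched directly.)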
01071     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
01072     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01073     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01074     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01075 
01076     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
01077     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
01078     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
01079     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
01080 
01081     // FIXME: these should be Legal but that's only for the case where
01082     // the index is constant.  For now, custom expand to deal with that.
01083     if (Subtarget->is64Bit()) {
01084       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01085       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01086     }
01087   }
01088 
01089   if (Subtarget->hasSSE2()) {
01090     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
01091     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
01092 
01093     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01094     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01095 
01096     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01097     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01098 
01099     // In the customized shift lowering, the legal cases in AVX2 will be
01100     // recognized.
01101     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01102     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01103 
01104     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01105     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01106 
01107     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01108 
01109     setOperationAction(ISD::SDIV,              MVT::v8i16, Custom);
01110     setOperationAction(ISD::SDIV,              MVT::v4i32, Custom);
01111   }
01112 
01113   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01114     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01115     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01116     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01117     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01118     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01119     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01120 
01121     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01122     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01123     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01124 
01125     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01126     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01127     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01128     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01129     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01130     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01131     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01132     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01133     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01134     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01135     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01136     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01137 
01138     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01139     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01140     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01141     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01142     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01143     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01144     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01145     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01146     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01147     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01148     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01149     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01150 
01151     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01152     // even though v8i16 is a legal type.
01153     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01154     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01155     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01156 
01157     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01158     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01159     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01160 
01161     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01162     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01163 
01164     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
01165 
01166     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01167     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01168 
01169     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01170     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01171 
01172     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01173     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01174 
01175     setOperationAction(ISD::SDIV,              MVT::v16i16, Custom);
01176 
01177     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01178     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01179     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01180     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01181 
01182     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01183     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01184     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01185 
01186     setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
01187     setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
01188     setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
01189     setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
01190 
01191     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01192     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01193     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01194     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01195     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01196     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01197     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01198     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01199     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01200     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01201     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01202     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01203 
01204     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01205       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01206       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01207       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01208       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01209       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01210       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01211     }
01212 
01213     if (Subtarget->hasInt256()) {
01214       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01215       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01216       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01217       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01218 
01219       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01220       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01221       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01222       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01223 
01224       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01225       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01226       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01227       // Don't lower v32i8 because there is no 128-bit byte mul
01228 
01229       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01230 
01231       setOperationAction(ISD::SDIV,            MVT::v8i32, Custom);
01232     } else {
01233       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01234       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01235       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01236       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01237 
01238       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01239       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01240       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01241       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01242 
01243       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01244       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01245       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01246       // Don't lower v32i8 because there is no 128-bit byte mul
01247     }
01248 
01249     // In the customized shift lowering, the legal cases in AVX2 will be
01250     // recognized.
01251     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01252     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01253 
01254     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01255     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01256 
01257     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01258 
01259     // Custom lower several nodes for 256-bit types.
01260     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01261              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01262       MVT VT = (MVT::SimpleValueType)i;
01263 
01264       // Extract subvector is special because the value type
01265       // (result) is 128-bit but the source is 256-bit wide.
01266       if (VT.is128BitVector())
01267         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01268 
01269       // Do not attempt to custom lower other non-256-bit vectors
01270       if (!VT.is256BitVector())
01271         continue;
01272 
01273       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01274       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01275       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01276       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01277       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01278       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01279       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01280     }
01281 
01282     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01283     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01284       MVT VT = (MVT::SimpleValueType)i;
01285 
01286       // Do not attempt to promote non-256-bit vectors
01287       if (!VT.is256BitVector())
01288         continue;
01289 
01290       setOperationAction(ISD::AND,    VT, Promote);
01291       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01292       setOperationAction(ISD::OR,     VT, Promote);
01293       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01294       setOperationAction(ISD::XOR,    VT, Promote);
01295       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01296       setOperationAction(ISD::LOAD,   VT, Promote);
01297       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01298       setOperationAction(ISD::SELECT, VT, Promote);
01299       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01300     }
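    // For example (roughly): with the promotion above, an AND of two v8i32
    // values is bitcast to v4i64, ANDed in that type, and the result is
    // bitcast back to v8i32; the same pattern applies to OR, XOR, LOAD and
    // SELECT for the other 256-bit integer types.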
01301   }
01302 
01303   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01304     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01305     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01306     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01307     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01308 
01309     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01310     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01311     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01312 
01313     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01314     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01315     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01316     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01317     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01318     setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
01319     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01320     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01321     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01322     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01323     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01324 
01325     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01326     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01327     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01328     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01329     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01330     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01331 
01332     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01333     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01334     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01335     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01336     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01337     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01338     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01339     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01340     setOperationAction(ISD::SDIV,               MVT::v16i32, Custom);
01341 
01342     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01343     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01344     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01345     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01346     if (Subtarget->is64Bit()) {
01347       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01348       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01349       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01350       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01351     }
01352     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01353     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01354     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01355     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01356     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01357     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01358     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01359     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01360     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01361     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01362 
01363     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01364     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01365     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01366     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01367     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01368     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01369     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01370     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01371     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01372     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01373     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01374     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01375     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01376 
01377     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01378     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01379     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01380     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01381     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
01382     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01383 
01384     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01385     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01386 
01387     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01388 
01389     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01390     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01391     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01392     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01393     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01394     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01395     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01396     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01397     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01398 
01399     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01400     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01401 
01402     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01403     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01404 
01405     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01406 
01407     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01408     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01409 
01410     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01411     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01412 
01413     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01414     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01415 
01416     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01417     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01418     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01419     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01420     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01421     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01422 
01423     // Custom lower several nodes.
01424     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01425              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01426       MVT VT = (MVT::SimpleValueType)i;
01427 
01428       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01429       // Extract subvector is special because the value type
01430       // (result) is 256/128-bit but the source is 512-bit wide.
01431       if (VT.is128BitVector() || VT.is256BitVector())
01432         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01433 
01434       if (VT.getVectorElementType() == MVT::i1)
01435         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01436 
01437       // Do not attempt to custom lower other non-512-bit vectors
01438       if (!VT.is512BitVector())
01439         continue;
01440 
01441       if (EltSize >= 32) {
01442         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01443         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01444         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01445         setOperationAction(ISD::VSELECT,             VT, Legal);
01446         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01447         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01448         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01449       }
01450     }
01451     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01452       MVT VT = (MVT::SimpleValueType)i;
01453 
01454       // Do not attempt to promote non-512-bit vectors
01455       if (!VT.is512BitVector())
01456         continue;
01457 
01458       setOperationAction(ISD::SELECT, VT, Promote);
01459       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01460     }
01461   } // has AVX-512
01462 
01463   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
01464   // of this type with custom code.
01465   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
01466            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
01467     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
01468                        Custom);
01469   }
01470 
01471   // We want to custom lower some of our intrinsics.
01472   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01473   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01474   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01475 
01476   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01477   // handle type legalization for these operations here.
01478   //
01479   // FIXME: We really should do custom legalization for addition and
01480   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01481   // than generic legalization for 64-bit multiplication-with-overflow, though.
01482   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01483     // Add/Sub/Mul with overflow operations are custom lowered.
01484     MVT VT = IntVTs[i];
01485     setOperationAction(ISD::SADDO, VT, Custom);
01486     setOperationAction(ISD::UADDO, VT, Custom);
01487     setOperationAction(ISD::SSUBO, VT, Custom);
01488     setOperationAction(ISD::USUBO, VT, Custom);
01489     setOperationAction(ISD::SMULO, VT, Custom);
01490     setOperationAction(ISD::UMULO, VT, Custom);
01491   }
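  // For example (a sketch of the intent): a call to
  // @llvm.sadd.with.overflow.i32 is custom lowered to an X86 add that also
  // defines EFLAGS, with the overflow bit read back via a setcc on the
  // overflow flag, instead of the generic expansion that re-derives the
  // overflow condition from the operands.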
01492 
01493   // There are no 8-bit 3-address imul/mul instructions
01494   setOperationAction(ISD::SMULO, MVT::i8, Expand);
01495   setOperationAction(ISD::UMULO, MVT::i8, Expand);
01496 
01497   if (!Subtarget->is64Bit()) {
01498     // These libcalls are not available in 32-bit.
01499     setLibcallName(RTLIB::SHL_I128, 0);
01500     setLibcallName(RTLIB::SRL_I128, 0);
01501     setLibcallName(RTLIB::SRA_I128, 0);
01502   }
01503 
01504   // Combine sin / cos into one node or libcall if possible.
01505   if (Subtarget->hasSinCos()) {
01506     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01507     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01508     if (Subtarget->isTargetDarwin()) {
01509       // For MacOSX, we don't want the normal expansion of a libcall to
01510       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
01511       // traffic.
01512       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01513       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01514     }
01515   }
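  // Illustratively, on Darwin this lets a sinf(x) / cosf(x) pair on the same
  // argument be combined into a single __sincos_stret call that returns both
  // results directly, avoiding the store/reload traffic of the plain sincos
  // libcall expansion.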
01516 
01517   // We have target-specific dag combine patterns for the following nodes:
01518   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01519   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01520   setTargetDAGCombine(ISD::VSELECT);
01521   setTargetDAGCombine(ISD::SELECT);
01522   setTargetDAGCombine(ISD::SHL);
01523   setTargetDAGCombine(ISD::SRA);
01524   setTargetDAGCombine(ISD::SRL);
01525   setTargetDAGCombine(ISD::OR);
01526   setTargetDAGCombine(ISD::AND);
01527   setTargetDAGCombine(ISD::ADD);
01528   setTargetDAGCombine(ISD::FADD);
01529   setTargetDAGCombine(ISD::FSUB);
01530   setTargetDAGCombine(ISD::FMA);
01531   setTargetDAGCombine(ISD::SUB);
01532   setTargetDAGCombine(ISD::LOAD);
01533   setTargetDAGCombine(ISD::STORE);
01534   setTargetDAGCombine(ISD::ZERO_EXTEND);
01535   setTargetDAGCombine(ISD::ANY_EXTEND);
01536   setTargetDAGCombine(ISD::SIGN_EXTEND);
01537   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01538   setTargetDAGCombine(ISD::TRUNCATE);
01539   setTargetDAGCombine(ISD::SINT_TO_FP);
01540   setTargetDAGCombine(ISD::SETCC);
01541   if (Subtarget->is64Bit())
01542     setTargetDAGCombine(ISD::MUL);
01543   setTargetDAGCombine(ISD::XOR);
01544 
01545   computeRegisterProperties();
01546 
01547   // On Darwin, -Os means optimize for size without hurting performance,
01548   // so do not reduce the limit.
01549   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01550   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01551   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01552   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01553   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01554   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01555   setPrefLoopAlignment(4); // 2^4 bytes.
01556 
01557   // Predictable cmovs don't hurt on Atom because it's in-order.
01558   PredictableSelectIsExpensive = !Subtarget->isAtom();
01559 
01560   setPrefFunctionAlignment(4); // 2^4 bytes.
01561 }
01562 
01563 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01564   if (!VT.isVector())
01565     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
01566 
01567   if (Subtarget->hasAVX512())
01568     switch(VT.getVectorNumElements()) {
01569     case  8: return MVT::v8i1;
01570     case 16: return MVT::v16i1;
01571   }
01572 
01573   return VT.changeVectorElementTypeToInteger();
01574 }
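// Illustrative examples of the hook above: a scalar f64 compare produces an
// i8 result (i1 with AVX-512), while a v8f32 compare produces v8i32 without
// AVX-512 and the v8i1 mask type with it.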
01575 
01576 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
01577 /// the desired ByVal argument alignment.
01578 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01579   if (MaxAlign == 16)
01580     return;
01581   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01582     if (VTy->getBitWidth() == 128)
01583       MaxAlign = 16;
01584   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01585     unsigned EltAlign = 0;
01586     getMaxByValAlign(ATy->getElementType(), EltAlign);
01587     if (EltAlign > MaxAlign)
01588       MaxAlign = EltAlign;
01589   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01590     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01591       unsigned EltAlign = 0;
01592       getMaxByValAlign(STy->getElementType(i), EltAlign);
01593       if (EltAlign > MaxAlign)
01594         MaxAlign = EltAlign;
01595       if (MaxAlign == 16)
01596         break;
01597     }
01598   }
01599 }
01600 
01601 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
01602 /// function arguments in the caller parameter area. For X86, aggregates
01603 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01604 /// are at 4-byte boundaries.
01605 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01606   if (Subtarget->is64Bit()) {
01607     // Max of 8 and alignment of type.
01608     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01609     if (TyAlign > 8)
01610       return TyAlign;
01611     return 8;
01612   }
01613 
01614   unsigned Align = 4;
01615   if (Subtarget->hasSSE1())
01616     getMaxByValAlign(Ty, Align);
01617   return Align;
01618 }
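// Roughly, for the hook above: on x86-64 a byval aggregate gets
// max(8, its ABI alignment); on 32-bit with SSE1, an aggregate containing a
// 128-bit vector is placed at a 16-byte boundary, while an aggregate of
// plain scalars stays at 4 bytes.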
01619 
01620 /// getOptimalMemOpType - Returns the target specific optimal type for load
01621 /// and store operations as a result of memset, memcpy, and memmove
01622 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
01623 /// constraint. Similarly, if SrcAlign is zero, there is no need to check it
01624 /// against the alignment requirement,
01625 /// probably because the source does not need to be loaded. If 'IsMemset' is
01626 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
01627 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
01628 /// source is constant so it does not need to be loaded.
01629 /// It returns EVT::Other if the type should be determined using generic
01630 /// target-independent logic.
01631 EVT
01632 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01633                                        unsigned DstAlign, unsigned SrcAlign,
01634                                        bool IsMemset, bool ZeroMemset,
01635                                        bool MemcpyStrSrc,
01636                                        MachineFunction &MF) const {
01637   const Function *F = MF.getFunction();
01638   if ((!IsMemset || ZeroMemset) &&
01639       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
01640                                        Attribute::NoImplicitFloat)) {
01641     if (Size >= 16 &&
01642         (Subtarget->isUnalignedMemAccessFast() ||
01643          ((DstAlign == 0 || DstAlign >= 16) &&
01644           (SrcAlign == 0 || SrcAlign >= 16)))) {
01645       if (Size >= 32) {
01646         if (Subtarget->hasInt256())
01647           return MVT::v8i32;
01648         if (Subtarget->hasFp256())
01649           return MVT::v8f32;
01650       }
01651       if (Subtarget->hasSSE2())
01652         return MVT::v4i32;
01653       if (Subtarget->hasSSE1())
01654         return MVT::v4f32;
01655     } else if (!MemcpyStrSrc && Size >= 8 &&
01656                !Subtarget->is64Bit() &&
01657                Subtarget->hasSSE2()) {
01658       // Do not use f64 to lower memcpy if source is string constant. It's
01659       // better to use i32 to avoid the loads.
01660       return MVT::f64;
01661     }
01662   }
01663   if (Subtarget->is64Bit() && Size >= 8)
01664     return MVT::i64;
01665   return MVT::i32;
01666 }
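// Some illustrative outcomes of the logic above: a 32-byte memcpy on an AVX2
// subtarget (with fast unaligned accesses or 16-byte-aligned operands) uses
// v8i32; with only SSE2 it uses v4i32; an 8-byte copy on a 32-bit SSE2 target
// uses f64 unless the source is a constant string; otherwise i64 is used on
// 64-bit targets and i32 elsewhere.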
01667 
01668 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01669   if (VT == MVT::f32)
01670     return X86ScalarSSEf32;
01671   else if (VT == MVT::f64)
01672     return X86ScalarSSEf64;
01673   return true;
01674 }
01675 
01676 bool
01677 X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
01678                                                  unsigned,
01679                                                  bool *Fast) const {
01680   if (Fast)
01681     *Fast = Subtarget->isUnalignedMemAccessFast();
01682   return true;
01683 }
01684 
01685 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
01686 /// current function.  The returned value is a member of the
01687 /// MachineJumpTableInfo::JTEntryKind enum.
01688 unsigned X86TargetLowering::getJumpTableEncoding() const {
01689   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01690   // symbol.
01691   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01692       Subtarget->isPICStyleGOT())
01693     return MachineJumpTableInfo::EK_Custom32;
01694 
01695   // Otherwise, use the normal jump table encoding heuristics.
01696   return TargetLowering::getJumpTableEncoding();
01697 }
01698 
01699 const MCExpr *
01700 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01701                                              const MachineBasicBlock *MBB,
01702                                              unsigned uid,MCContext &Ctx) const{
01703   assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01704          Subtarget->isPICStyleGOT());
01705   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01706   // entries.
01707   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01708                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01709 }
01710 
01711 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
01712 /// jumptable.
01713 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01714                                                     SelectionDAG &DAG) const {
01715   if (!Subtarget->is64Bit())
01716     // This doesn't have SDLoc associated with it, but is not really the
01717     // same as a Register.
01718     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01719   return Table;
01720 }
01721 
01722 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
01723 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
01724 /// MCExpr.
01725 const MCExpr *X86TargetLowering::
01726 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01727                              MCContext &Ctx) const {
01728   // X86-64 uses RIP relative addressing based on the jump table label.
01729   if (Subtarget->isPICStyleRIPRel())
01730     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01731 
01732   // Otherwise, the reference is relative to the PIC base.
01733   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01734 }
01735 
01736 // FIXME: Why is this routine here? Move to RegInfo!
01737 std::pair<const TargetRegisterClass*, uint8_t>
01738 X86TargetLowering::findRepresentativeClass(MVT VT) const{
01739   const TargetRegisterClass *RRC = 0;
01740   uint8_t Cost = 1;
01741   switch (VT.SimpleTy) {
01742   default:
01743     return TargetLowering::findRepresentativeClass(VT);
01744   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01745     RRC = Subtarget->is64Bit() ?
01746       (const TargetRegisterClass*)&X86::GR64RegClass :
01747       (const TargetRegisterClass*)&X86::GR32RegClass;
01748     break;
01749   case MVT::x86mmx:
01750     RRC = &X86::VR64RegClass;
01751     break;
01752   case MVT::f32: case MVT::f64:
01753   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01754   case MVT::v4f32: case MVT::v2f64:
01755   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01756   case MVT::v4f64:
01757     RRC = &X86::VR128RegClass;
01758     break;
01759   }
01760   return std::make_pair(RRC, Cost);
01761 }
01762 
01763 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01764                                                unsigned &Offset) const {
01765   if (!Subtarget->isTargetLinux())
01766     return false;
01767 
01768   if (Subtarget->is64Bit()) {
01769     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
01770     Offset = 0x28;
01771     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01772       AddressSpace = 256;
01773     else
01774       AddressSpace = 257;
01775   } else {
01776     // %gs:0x14 on i386
01777     Offset = 0x14;
01778     AddressSpace = 256;
01779   }
01780   return true;
01781 }
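// Note on the address spaces used above: 256 and 257 are the X86 backend's
// conventions for %gs- and %fs-relative addressing, so (257, 0x28)
// corresponds to the usual %fs:0x28 stack-protector slot on 64-bit Linux and
// (256, 0x14) to %gs:0x14 on i386.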
01782 
01783 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01784                                             unsigned DestAS) const {
01785   assert(SrcAS != DestAS && "Expected different address spaces!");
01786 
01787   return SrcAS < 256 && DestAS < 256;
01788 }
01789 
01790 //===----------------------------------------------------------------------===//
01791 //               Return Value Calling Convention Implementation
01792 //===----------------------------------------------------------------------===//
01793 
01794 #include "X86GenCallingConv.inc"
01795 
01796 bool
01797 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01798                                   MachineFunction &MF, bool isVarArg,
01799                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01800                         LLVMContext &Context) const {
01801   SmallVector<CCValAssign, 16> RVLocs;
01802   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
01803                  RVLocs, Context);
01804   return CCInfo.CheckReturn(Outs, RetCC_X86);
01805 }
01806 
01807 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
01808   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
01809   return ScratchRegs;
01810 }
01811 
01812 SDValue
01813 X86TargetLowering::LowerReturn(SDValue Chain,
01814                                CallingConv::ID CallConv, bool isVarArg,
01815                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01816                                const SmallVectorImpl<SDValue> &OutVals,
01817                                SDLoc dl, SelectionDAG &DAG) const {
01818   MachineFunction &MF = DAG.getMachineFunction();
01819   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01820 
01821   SmallVector<CCValAssign, 16> RVLocs;
01822   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
01823                  RVLocs, *DAG.getContext());
01824   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01825 
01826   SDValue Flag;
01827   SmallVector<SDValue, 6> RetOps;
01828   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
01829   // Operand #1 = Bytes To Pop
01830   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
01831                    MVT::i16));
01832 
01833   // Copy the result values into the output registers.
01834   for (unsigned i = 0; i != RVLocs.size(); ++i) {
01835     CCValAssign &VA = RVLocs[i];
01836     assert(VA.isRegLoc() && "Can only return in registers!");
01837     SDValue ValToCopy = OutVals[i];
01838     EVT ValVT = ValToCopy.getValueType();
01839 
01840     // Promote values to the appropriate types
01841     if (VA.getLocInfo() == CCValAssign::SExt)
01842       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
01843     else if (VA.getLocInfo() == CCValAssign::ZExt)
01844       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
01845     else if (VA.getLocInfo() == CCValAssign::AExt)
01846       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
01847     else if (VA.getLocInfo() == CCValAssign::BCvt)
01848       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
01849 
01850     assert(VA.getLocInfo() != CCValAssign::FPExt &&
01851            "Unexpected FP-extend for return value.");  
01852 
01853     // If this is x86-64, and we disabled SSE, we can't return FP values,
01854     // or SSE or MMX vectors.
01855     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
01856          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
01857           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
01858       report_fatal_error("SSE register return with SSE disabled");
01859     }
01860     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
01861     // llvm-gcc has never done it right and no one has noticed, so this
01862     // should be OK for now.
01863     if (ValVT == MVT::f64 &&
01864         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
01865       report_fatal_error("SSE2 register return with SSE2 disabled");
01866 
01867     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
01868     // the RET instruction and handled by the FP Stackifier.
01869     if (VA.getLocReg() == X86::ST0 ||
01870         VA.getLocReg() == X86::ST1) {
01871       // If this is a copy from an xmm register to ST(0), use an FPExtend to
01872       // change the value to the FP stack register class.
01873       if (isScalarFPTypeInSSEReg(VA.getValVT()))
01874         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
01875       RetOps.push_back(ValToCopy);
01876       // Don't emit a copytoreg.
01877       continue;
01878     }
01879 
01880     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
01881     // which is returned in RAX / RDX.
01882     if (Subtarget->is64Bit()) {
01883       if (ValVT == MVT::x86mmx) {
01884         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
01885           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
01886           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
01887                                   ValToCopy);
01888           // If we don't have SSE2 available, convert to v4f32 so the generated
01889           // register is legal.
01890           if (!Subtarget->hasSSE2())
01891             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
01892         }
01893       }
01894     }
01895 
01896     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
01897     Flag = Chain.getValue(1);
01898     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
01899   }
01900 
01901   // The x86-64 ABIs require that for returning structs by value we copy
01902   // the sret argument into %rax/%eax (depending on ABI) for the return.
01903   // Win32 requires us to put the sret argument to %eax as well.
01904   // We saved the argument into a virtual register in the entry block,
01905   // so now we copy the value out and into %rax/%eax.
01906   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
01907       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
01908     MachineFunction &MF = DAG.getMachineFunction();
01909     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01910     unsigned Reg = FuncInfo->getSRetReturnReg();
01911     assert(Reg &&
01912            "SRetReturnReg should have been set in LowerFormalArguments().");
01913     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
01914 
01915     unsigned RetValReg
01916         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
01917           X86::RAX : X86::EAX;
01918     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
01919     Flag = Chain.getValue(1);
01920 
01921     // RAX/EAX now acts like a return value.
01922     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
01923   }
01924 
01925   RetOps[0] = Chain;  // Update chain.
01926 
01927   // Add the flag if we have it.
01928   if (Flag.getNode())
01929     RetOps.push_back(Flag);
01930 
01931   return DAG.getNode(X86ISD::RET_FLAG, dl,
01932                      MVT::Other, &RetOps[0], RetOps.size());
01933 }
01934 
01935 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
01936   if (N->getNumValues() != 1)
01937     return false;
01938   if (!N->hasNUsesOfValue(1, 0))
01939     return false;
01940 
01941   SDValue TCChain = Chain;
01942   SDNode *Copy = *N->use_begin();
01943   if (Copy->getOpcode() == ISD::CopyToReg) {
01944     // If the copy has a glue operand, we conservatively assume it isn't safe to
01945     // perform a tail call.
01946     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
01947       return false;
01948     TCChain = Copy->getOperand(0);
01949   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
01950     return false;
01951 
01952   bool HasRet = false;
01953   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
01954        UI != UE; ++UI) {
01955     if (UI->getOpcode() != X86ISD::RET_FLAG)
01956       return false;
01957     HasRet = true;
01958   }
01959 
01960   if (!HasRet)
01961     return false;
01962 
01963   Chain = TCChain;
01964   return true;
01965 }
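// In other words (roughly): the node may be tail-call folded only when its
// single use is a CopyToReg (or an FP_EXTEND) whose only users are
// X86ISD::RET_FLAG nodes, i.e. the value does nothing but get returned.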
01966 
01967 MVT
01968 X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
01969                                             ISD::NodeType ExtendKind) const {
01970   MVT ReturnMVT;
01971   // TODO: Is this also valid on 32-bit?
01972   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
01973     ReturnMVT = MVT::i8;
01974   else
01975     ReturnMVT = MVT::i32;
01976 
01977   MVT MinVT = getRegisterType(ReturnMVT);
01978   return VT.bitsLT(MinVT) ? MinVT : VT;
01979 }
01980 
01981 /// LowerCallResult - Lower the result values of a call into the
01982 /// appropriate copies out of appropriate physical registers.
01983 ///
01984 SDValue
01985 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
01986                                    CallingConv::ID CallConv, bool isVarArg,
01987                                    const SmallVectorImpl<ISD::InputArg> &Ins,
01988                                    SDLoc dl, SelectionDAG &DAG,
01989                                    SmallVectorImpl<SDValue> &InVals) const {
01990 
01991   // Assign locations to each value returned by this call.
01992   SmallVector<CCValAssign, 16> RVLocs;
01993   bool Is64Bit = Subtarget->is64Bit();
01994   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
01995                  getTargetMachine(), RVLocs, *DAG.getContext());
01996   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
01997 
01998   // Copy all of the result registers out of their specified physreg.
01999   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02000     CCValAssign &VA = RVLocs[i];
02001     EVT CopyVT = VA.getValVT();
02002 
02003     // If this is x86-64, and we disabled SSE, we can't return FP values
02004     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02005         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02006       report_fatal_error("SSE register return with SSE disabled");
02007     }
02008 
02009     SDValue Val;
02010 
02011     // If this is a call to a function that returns an fp value on the floating
02012     // point stack, we must guarantee the value is popped from the stack, so
02013     // a CopyFromReg is not good enough - the copy instruction may be eliminated
02014     // if the return value is not used. We use the FpPOP_RETVAL instruction
02015     // instead.
02016     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
02017       // If we prefer to use the value in xmm registers, copy it out as f80 and
02018       // use a truncate to move it from fp stack reg to xmm reg.
02019       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
02020       SDValue Ops[] = { Chain, InFlag };
02021       Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
02022                                          MVT::Other, MVT::Glue, Ops), 1);
02023       Val = Chain.getValue(0);
02024 
02025       // Round the f80 to the right size, which also moves it to the appropriate
02026       // xmm register.
02027       if (CopyVT != VA.getValVT())
02028         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02029                           // This truncation won't change the value.
02030                           DAG.getIntPtrConstant(1));
02031     } else {
02032       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02033                                  CopyVT, InFlag).getValue(1);
02034       Val = Chain.getValue(0);
02035     }
02036     InFlag = Chain.getValue(2);
02037     InVals.push_back(Val);
02038   }
02039 
02040   return Chain;
02041 }
02042 
02043 //===----------------------------------------------------------------------===//
02044 //                C & StdCall & Fast Calling Convention implementation
02045 //===----------------------------------------------------------------------===//
02046 //  The StdCall calling convention is the standard for many Windows API
02047 //  routines. It differs from the C calling convention only slightly: the
02048 //  callee, not the caller, cleans up the stack, and symbols are decorated
02049 //  in some fancy way :) It doesn't support any vector arguments.
02050 //  For info on the fast calling convention, see the Fast Calling Convention
02051 //  (tail call) implementation, LowerX86_32FastCCCallTo.
02052 
02053 /// CallIsStructReturn - Determines whether a call uses struct return
02054 /// semantics.
02055 enum StructReturnType {
02056   NotStructReturn,
02057   RegStructReturn,
02058   StackStructReturn
02059 };
02060 static StructReturnType
02061 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02062   if (Outs.empty())
02063     return NotStructReturn;
02064 
02065   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02066   if (!Flags.isSRet())
02067     return NotStructReturn;
02068   if (Flags.isInReg())
02069     return RegStructReturn;
02070   return StackStructReturn;
02071 }
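// For example: a call whose first outgoing argument carries the sret flag is
// a struct return; if that argument is additionally marked inreg, the struct
// pointer travels in a register (RegStructReturn), otherwise it is passed on
// the stack (StackStructReturn).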
02072 
02073 /// ArgsAreStructReturn - Determines whether a function uses struct
02074 /// return semantics.
02075 static StructReturnType
02076 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02077   if (Ins.empty())
02078     return NotStructReturn;
02079 
02080   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02081   if (!Flags.isSRet())
02082     return NotStructReturn;
02083   if (Flags.isInReg())
02084     return RegStructReturn;
02085   return StackStructReturn;
02086 }
02087 
02088 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
02089 /// by "Src" to address "Dst" with size and alignment information specified by
02090 /// the specific parameter attribute. The copy will be passed as a byval
02091 /// function parameter.
02092 static SDValue
02093 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02094                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02095                           SDLoc dl) {
02096   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02097 
02098   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02099                        /*isVolatile*/false, /*AlwaysInline=*/true,
02100                        MachinePointerInfo(), MachinePointerInfo());
02101 }
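// A typical use of the helper above: during call lowering, a byval argument's
// bytes are copied from the caller's object into the outgoing argument slot
// with a single memcpy node; AlwaysInline=true lets small copies expand to
// plain load/store sequences instead of a libcall.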
02102 
02103 /// IsTailCallConvention - Return true if the calling convention is one that
02104 /// supports tail call optimization.
02105 static bool IsTailCallConvention(CallingConv::ID CC) {
02106   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02107           CC == CallingConv::HiPE);
02108 }
02109 
02110 /// \brief Return true if the calling convention is a C calling convention.
02111 static bool IsCCallConvention(CallingConv::ID CC) {
02112   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02113           CC == CallingConv::X86_64_SysV);
02114 }
02115 
02116 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02117   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02118     return false;
02119 
02120   CallSite CS(CI);
02121   CallingConv::ID CalleeCC = CS.getCallingConv();
02122   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02123     return false;
02124 
02125   return true;
02126 }
02127 
02128 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
02129 /// a tailcall target by changing its ABI.
02130 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02131                                    bool GuaranteedTailCallOpt) {
02132   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02133 }
02134 
02135 SDValue
02136 X86TargetLowering::LowerMemArgument(SDValue Chain,
02137                                     CallingConv::ID CallConv,
02138                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02139                                     SDLoc dl, SelectionDAG &DAG,
02140                                     const CCValAssign &VA,
02141                                     MachineFrameInfo *MFI,
02142                                     unsigned i) const {
02143   // Create the nodes corresponding to a load from this parameter slot.
02144   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02145   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
02146                               getTargetMachine().Options.GuaranteedTailCallOpt);
02147   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02148   EVT ValVT;
02149 
02150   // If value is passed by pointer we have address passed instead of the value
02151   // itself.
02152   if (VA.getLocInfo() == CCValAssign::Indirect)
02153     ValVT = VA.getLocVT();
02154   else
02155     ValVT = VA.getValVT();
02156 
02157   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02158   // changed with more analysis.
02159   // In case of tail call optimization, mark all arguments mutable, since they
02160   // could be overwritten by the lowering of arguments in case of a tail call.
02161   if (Flags.isByVal()) {
02162     unsigned Bytes = Flags.getByValSize();
02163     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02164     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02165     return DAG.getFrameIndex(FI, getPointerTy());
02166   } else {
02167     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02168                                     VA.getLocMemOffset(), isImmutable);
02169     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02170     return DAG.getLoad(ValVT, dl, Chain, FIN,
02171                        MachinePointerInfo::getFixedStack(FI),
02172                        false, false, false, 0);
02173   }
02174 }
02175 
02176 SDValue
02177 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02178                                         CallingConv::ID CallConv,
02179                                         bool isVarArg,
02180                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02181                                         SDLoc dl,
02182                                         SelectionDAG &DAG,
02183                                         SmallVectorImpl<SDValue> &InVals)
02184                                           const {
02185   MachineFunction &MF = DAG.getMachineFunction();
02186   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02187 
02188   const Function* Fn = MF.getFunction();
02189   if (Fn->hasExternalLinkage() &&
02190       Subtarget->isTargetCygMing() &&
02191       Fn->getName() == "main")
02192     FuncInfo->setForceFramePointer(true);
02193 
02194   MachineFrameInfo *MFI = MF.getFrameInfo();
02195   bool Is64Bit = Subtarget->is64Bit();
02196   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02197 
02198   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02199          "Var args not supported with calling convention fastcc, ghc or hipe");
02200 
02201   // Assign locations to all of the incoming arguments.
02202   SmallVector<CCValAssign, 16> ArgLocs;
02203   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
02204                  ArgLocs, *DAG.getContext());
02205 
02206   // Allocate shadow area for Win64
02207   if (IsWin64)
02208     CCInfo.AllocateStack(32, 8);
02209 
02210   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02211 
02212   unsigned LastVal = ~0U;
02213   SDValue ArgValue;
02214   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02215     CCValAssign &VA = ArgLocs[i];
02216     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02217     // places.
02218     assert(VA.getValNo() != LastVal &&
02219            "Don't support value assigned to multiple locs yet");
02220     (void)LastVal;
02221     LastVal = VA.getValNo();
02222 
02223     if (VA.isRegLoc()) {
02224       EVT RegVT = VA.getLocVT();
02225       const TargetRegisterClass *RC;
02226       if (RegVT == MVT::i32)
02227         RC = &X86::GR32RegClass;
02228       else if (Is64Bit && RegVT == MVT::i64)
02229         RC = &X86::GR64RegClass;
02230       else if (RegVT == MVT::f32)
02231         RC = &X86::FR32RegClass;
02232       else if (RegVT == MVT::f64)
02233         RC = &X86::FR64RegClass;
02234       else if (RegVT.is512BitVector())
02235         RC = &X86::VR512RegClass;
02236       else if (RegVT.is256BitVector())
02237         RC = &X86::VR256RegClass;
02238       else if (RegVT.is128BitVector())
02239         RC = &X86::VR128RegClass;
02240       else if (RegVT == MVT::x86mmx)
02241         RC = &X86::VR64RegClass;
02242       else if (RegVT == MVT::i1)
02243         RC = &X86::VK1RegClass;
02244       else if (RegVT == MVT::v8i1)
02245         RC = &X86::VK8RegClass;
02246       else if (RegVT == MVT::v16i1)
02247         RC = &X86::VK16RegClass;
02248       else
02249         llvm_unreachable("Unknown argument type!");
02250 
02251       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02252       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02253 
02254       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02255       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02256       // right size.
02257       if (VA.getLocInfo() == CCValAssign::SExt)
02258         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02259                                DAG.getValueType(VA.getValVT()));
02260       else if (VA.getLocInfo() == CCValAssign::ZExt)
02261         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02262                                DAG.getValueType(VA.getValVT()));
02263       else if (VA.getLocInfo() == CCValAssign::BCvt)
02264         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02265 
02266       if (VA.isExtInLoc()) {
02267         // Handle MMX values passed in XMM regs.
02268         if (RegVT.isVector())
02269           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02270         else
02271           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02272       }
02273     } else {
02274       assert(VA.isMemLoc());
02275       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02276     }
02277 
02278     // If value is passed via pointer - do a load.
02279     if (VA.getLocInfo() == CCValAssign::Indirect)
02280       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02281                              MachinePointerInfo(), false, false, false, 0);
02282 
02283     InVals.push_back(ArgValue);
02284   }
02285 
02286   // The x86-64 ABIs require that for returning structs by value we copy
02287   // the sret argument into %rax/%eax (depending on ABI) for the return.
02288   // Win32 requires us to put the sret argument in %eax as well.
02289   // Save the argument into a virtual register so that we can access it
02290   // from the return points.
02291   if (MF.getFunction()->hasStructRetAttr() &&
02292       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
02293     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02294     unsigned Reg = FuncInfo->getSRetReturnReg();
02295     if (!Reg) {
02296       MVT PtrTy = getPointerTy();
02297       Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02298       FuncInfo->setSRetReturnReg(Reg);
02299     }
02300     SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
02301     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02302   }
02303 
02304   unsigned StackSize = CCInfo.getNextStackOffset();
02305   // Align stack specially for tail calls.
02306   if (FuncIsMadeTailCallSafe(CallConv,
02307                              MF.getTarget().Options.GuaranteedTailCallOpt))
02308     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02309 
02310   // If the function takes variable number of arguments, make a frame index for
02311   // the start of the first vararg value... for expansion of llvm.va_start.
02312   if (isVarArg) {
02313     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02314                     CallConv != CallingConv::X86_ThisCall)) {
02315       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
02316     }
02317     if (Is64Bit) {
02318       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
02319 
02320       // FIXME: We should really autogenerate these arrays
02321       static const MCPhysReg GPR64ArgRegsWin64[] = {
02322         X86::RCX, X86::RDX, X86::R8,  X86::R9
02323       };
02324       static const MCPhysReg GPR64ArgRegs64Bit[] = {
02325         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02326       };
02327       static const MCPhysReg XMMArgRegs64Bit[] = {
02328         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02329         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02330       };
02331       const MCPhysReg *GPR64ArgRegs;
02332       unsigned NumXMMRegs = 0;
02333 
02334       if (IsWin64) {
02335         // The XMM registers which might contain var arg parameters are shadowed
02336         // in their paired GPR.  So we only need to save the GPRs to their home
02337         // slots.
02338         TotalNumIntRegs = 4;
02339         GPR64ArgRegs = GPR64ArgRegsWin64;
02340       } else {
02341         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
02342         GPR64ArgRegs = GPR64ArgRegs64Bit;
02343 
02344         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
02345                                                 TotalNumXMMRegs);
02346       }
02347       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
02348                                                        TotalNumIntRegs);
02349 
02350       bool NoImplicitFloatOps = Fn->getAttributes().
02351         hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
02352       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02353              "SSE register cannot be used when SSE is disabled!");
02354       assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
02355                NoImplicitFloatOps) &&
02356              "SSE register cannot be used when SSE is disabled!");
02357       if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02358           !Subtarget->hasSSE1())
02359         // Kernel mode asks for SSE to be disabled, so don't push the XMM
02360         // argument registers on the stack.
02361         TotalNumXMMRegs = 0;
02362 
02363       if (IsWin64) {
02364         const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
02365         // Get to the caller-allocated home save location.  Add 8 to account
02366         // for the return address.
02367         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02368         FuncInfo->setRegSaveFrameIndex(
02369           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02370         // Fixup to set vararg frame on shadow area (4 x i64).
02371         if (NumIntRegs < 4)
02372           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02373       } else {
02374         // For X86-64, if there are vararg parameters that are passed via
02375         // registers, then we must store them to their spots on the stack so
02376         // they may be loaded by dereferencing the result of va_next.
02377         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02378         FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
02379         FuncInfo->setRegSaveFrameIndex(
02380           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
02381                                false));
02382       }
02383 
02384       // Store the integer parameter registers.
02385       SmallVector<SDValue, 8> MemOps;
02386       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02387                                         getPointerTy());
02388       unsigned Offset = FuncInfo->getVarArgsGPOffset();
02389       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
02390         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02391                                   DAG.getIntPtrConstant(Offset));
02392         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
02393                                      &X86::GR64RegClass);
02394         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
02395         SDValue Store =
02396           DAG.getStore(Val.getValue(1), dl, Val, FIN,
02397                        MachinePointerInfo::getFixedStack(
02398                          FuncInfo->getRegSaveFrameIndex(), Offset),
02399                        false, false, 0);
02400         MemOps.push_back(Store);
02401         Offset += 8;
02402       }
02403 
02404       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
02405         // Now store the XMM (fp + vector) parameter registers.
02406         SmallVector<SDValue, 11> SaveXMMOps;
02407         SaveXMMOps.push_back(Chain);
02408 
02409         unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02410         SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
02411         SaveXMMOps.push_back(ALVal);
02412 
02413         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02414                                FuncInfo->getRegSaveFrameIndex()));
02415         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02416                                FuncInfo->getVarArgsFPOffset()));
02417 
02418         for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
02419           unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
02420                                        &X86::VR128RegClass);
02421           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
02422           SaveXMMOps.push_back(Val);
02423         }
02424         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02425                                      MVT::Other,
02426                                      &SaveXMMOps[0], SaveXMMOps.size()));
02427       }
02428 
02429       if (!MemOps.empty())
02430         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
02431                             &MemOps[0], MemOps.size());
02432     }
02433   }
02434 
02435   // Some CCs need callee pop.
02436   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02437                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02438     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02439   } else {
02440     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02441     // If this is an sret function, the return should pop the hidden pointer.
02442     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02443         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02444         argsAreStructReturn(Ins) == StackStructReturn)
02445       FuncInfo->setBytesToPopOnReturn(4);
02446   }
02447 
02448   if (!Is64Bit) {
02449     // RegSaveFrameIndex is X86-64 only.
02450     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02451     if (CallConv == CallingConv::X86_FastCall ||
02452         CallConv == CallingConv::X86_ThisCall)
02453       // fastcc functions can't have varargs.
02454       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02455   }
02456 
02457   FuncInfo->setArgumentStackSize(StackSize);
02458 
02459   return Chain;
02460 }
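// An illustrative, standalone sketch (not part of the X86 lowering itself):
// the register save area built in LowerFormalArguments above follows the
// System V x86-64 va_list layout, a 6*8 + 8*16 = 176 byte spill area in which
// VarArgsGPOffset steps by 8 through the GPR portion and VarArgsFPOffset
// starts at 48 and steps by 16 through the XMM portion.  The helpers below
// mirror those constants only; the names are illustrative and assume C++11.
namespace va_area_sketch {
// Offset of the first unused GPR slot after NumIntRegs fixed integer args.
constexpr unsigned initialGPOffset(unsigned NumIntRegs) {
  return NumIntRegs * 8;
}
// Offset of the first unused XMM slot; the 48-byte GPR portion comes first.
constexpr unsigned initialFPOffset(unsigned NumXMMRegs) {
  return 6 * 8 + NumXMMRegs * 16;
}
constexpr unsigned regSaveAreaSize() { return 6 * 8 + 8 * 16; } // 176 bytes
static_assert(initialGPOffset(2) == 16, "e.g. RDI and RSI already used");
static_assert(initialFPOffset(0) == 48, "FP area starts after the 6 GPR slots");
static_assert(regSaveAreaSize() == 176, "matches the CreateStackObject above");
} // namespace va_area_sketch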
02461 
02462 SDValue
02463 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02464                                     SDValue StackPtr, SDValue Arg,
02465                                     SDLoc dl, SelectionDAG &DAG,
02466                                     const CCValAssign &VA,
02467                                     ISD::ArgFlagsTy Flags) const {
02468   unsigned LocMemOffset = VA.getLocMemOffset();
02469   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02470   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02471   if (Flags.isByVal())
02472     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02473 
02474   return DAG.getStore(Chain, dl, Arg, PtrOff,
02475                       MachinePointerInfo::getStack(LocMemOffset),
02476                       false, false, 0);
02477 }
02478 
02479 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
02480 /// optimization is performed and it is required.
02481 SDValue
02482 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02483                                            SDValue &OutRetAddr, SDValue Chain,
02484                                            bool IsTailCall, bool Is64Bit,
02485                                            int FPDiff, SDLoc dl) const {
02486   // Adjust the Return address stack slot.
02487   EVT VT = getPointerTy();
02488   OutRetAddr = getReturnAddressFrameIndex(DAG);
02489 
02490   // Load the "old" Return address.
02491   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02492                            false, false, false, 0);
02493   return SDValue(OutRetAddr.getNode(), 1);
02494 }
02495 
02496 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
02497 /// optimization is performed and it is required (FPDiff!=0).
02498 static SDValue
02499 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
02500                          SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
02501                          unsigned SlotSize, int FPDiff, SDLoc dl) {
02502   // Store the return address to the appropriate stack slot.
02503   if (!FPDiff) return Chain;
02504   // Calculate the new stack slot for the return address.
02505   int NewReturnAddrFI =
02506     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02507                                          false);
02508   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02509   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02510                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02511                        false, false, 0);
02512   return Chain;
02513 }
02514 
02515 SDValue
02516 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02517                              SmallVectorImpl<SDValue> &InVals) const {
02518   SelectionDAG &DAG                     = CLI.DAG;
02519   SDLoc &dl                             = CLI.DL;
02520   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02521   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02522   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02523   SDValue Chain                         = CLI.Chain;
02524   SDValue Callee                        = CLI.Callee;
02525   CallingConv::ID CallConv              = CLI.CallConv;
02526   bool &isTailCall                      = CLI.IsTailCall;
02527   bool isVarArg                         = CLI.IsVarArg;
02528 
02529   MachineFunction &MF = DAG.getMachineFunction();
02530   bool Is64Bit        = Subtarget->is64Bit();
02531   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02532   StructReturnType SR = callIsStructReturn(Outs);
02533   bool IsSibcall      = false;
02534 
02535   if (MF.getTarget().Options.DisableTailCalls)
02536     isTailCall = false;
02537 
02538   if (isTailCall) {
02539     // Check if it's really possible to do a tail call.
02540     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02541                     isVarArg, SR != NotStructReturn,
02542                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02543                     Outs, OutVals, Ins, DAG);
02544 
02545     // Sibcalls are automatically detected tailcalls which do not require
02546     // ABI changes.
02547     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02548       IsSibcall = true;
02549 
02550     if (isTailCall)
02551       ++NumTailCalls;
02552   }
02553 
02554   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02555          "Var args not supported with calling convention fastcc, ghc or hipe");
02556 
02557   // Analyze operands of the call, assigning locations to each operand.
02558   SmallVector<CCValAssign, 16> ArgLocs;
02559   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
02560                  ArgLocs, *DAG.getContext());
02561 
02562   // Allocate shadow area for Win64
02563   if (IsWin64)
02564     CCInfo.AllocateStack(32, 8);
02565 
02566   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02567 
02568   // Get a count of how many bytes are to be pushed on the stack.
02569   unsigned NumBytes = CCInfo.getNextStackOffset();
02570   if (IsSibcall)
02571     // This is a sibcall. The memory operands are already available in the
02572     // caller's incoming argument space.
02573     NumBytes = 0;
02574   else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
02575            IsTailCallConvention(CallConv))
02576     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02577 
02578   int FPDiff = 0;
02579   if (isTailCall && !IsSibcall) {
02580     // Lower arguments at fp - stackoffset + fpdiff.
02581     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02582     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02583 
02584     FPDiff = NumBytesCallerPushed - NumBytes;
02585 
02586     // Record the delta of movement of the return address stack slot, but
02587     // only if the new delta is smaller (more negative) than the previous one.
02588     if (FPDiff < X86Info->getTCReturnAddrDelta())
02589       X86Info->setTCReturnAddrDelta(FPDiff);
02590   }
02591 
02592   unsigned NumBytesToPush = NumBytes;
02593   unsigned NumBytesToPop = NumBytes;
02594 
02595   // If we have an inalloca argument, all stack space has already been allocated
02596   // for us and is right at the top of the stack.  We don't support multiple
02597   // arguments passed in memory when using inalloca.
02598   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02599     NumBytesToPush = 0;
02600     assert(ArgLocs.back().getLocMemOffset() == 0 &&
02601            "an inalloca argument must be the only memory argument");
02602   }
02603 
02604   if (!IsSibcall)
02605     Chain = DAG.getCALLSEQ_START(
02606         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02607 
02608   SDValue RetAddrFrIdx;
02609   // Load return address for tail calls.
02610   if (isTailCall && FPDiff)
02611     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02612                                     Is64Bit, FPDiff, dl);
02613 
02614   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02615   SmallVector<SDValue, 8> MemOpChains;
02616   SDValue StackPtr;
02617 
02618   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02619   // of tail call optimization, arguments are handled later.
02620   const X86RegisterInfo *RegInfo =
02621     static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
02622   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02623     // Skip inalloca arguments, they have already been written.
02624     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02625     if (Flags.isInAlloca())
02626       continue;
02627 
02628     CCValAssign &VA = ArgLocs[i];
02629     EVT RegVT = VA.getLocVT();
02630     SDValue Arg = OutVals[i];
02631     bool isByVal = Flags.isByVal();
02632 
02633     // Promote the value if needed.
02634     switch (VA.getLocInfo()) {
02635     default: llvm_unreachable("Unknown loc info!");
02636     case CCValAssign::Full: break;
02637     case CCValAssign::SExt:
02638       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02639       break;
02640     case CCValAssign::ZExt:
02641       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02642       break;
02643     case CCValAssign::AExt:
02644       if (RegVT.is128BitVector()) {
02645         // Special case: passing MMX values in XMM registers.
02646         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02647         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02648         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02649       } else
02650         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02651       break;
02652     case CCValAssign::BCvt:
02653       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02654       break;
02655     case CCValAssign::Indirect: {
02656       // Store the argument.
02657       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02658       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02659       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02660                            MachinePointerInfo::getFixedStack(FI),
02661                            false, false, 0);
02662       Arg = SpillSlot;
02663       break;
02664     }
02665     }
02666 
02667     if (VA.isRegLoc()) {
02668       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02669       if (isVarArg && IsWin64) {
02670         // Win64 ABI requires argument XMM reg to be copied to the corresponding
02671         // shadow reg if callee is a varargs function.
02672         unsigned ShadowReg = 0;
02673         switch (VA.getLocReg()) {
02674         case X86::XMM0: ShadowReg = X86::RCX; break;
02675         case X86::XMM1: ShadowReg = X86::RDX; break;
02676         case X86::XMM2: ShadowReg = X86::R8; break;
02677         case X86::XMM3: ShadowReg = X86::R9; break;
02678         }
02679         if (ShadowReg)
02680           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02681       }
02682     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02683       assert(VA.isMemLoc());
02684       if (StackPtr.getNode() == 0)
02685         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02686                                       getPointerTy());
02687       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02688                                              dl, DAG, VA, Flags));
02689     }
02690   }
02691 
02692   if (!MemOpChains.empty())
02693     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
02694                         &MemOpChains[0], MemOpChains.size());
02695 
02696   if (Subtarget->isPICStyleGOT()) {
02697     // ELF / PIC requires the GOT pointer to be in the EBX register before
02698     // making function calls via the PLT.
02699     if (!isTailCall) {
02700       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02701                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02702     } else {
02703       // If we are tail calling and generating PIC/GOT style code load the
02704       // address of the callee into ECX. The value in ecx is used as target of
02705       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02706       // for tail calls on PIC/GOT architectures. Normally we would just put the
02707       // address of GOT into ebx and then call target@PLT. But for tail calls
02708       // ebx would be restored (since ebx is callee saved) before jumping to the
02709       // target@PLT.
02710 
02711       // Note: The actual moving to ECX is done further down.
02712       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02713       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02714           !G->getGlobal()->hasProtectedVisibility())
02715         Callee = LowerGlobalAddress(Callee, DAG);
02716       else if (isa<ExternalSymbolSDNode>(Callee))
02717         Callee = LowerExternalSymbol(Callee, DAG);
02718     }
02719   }
02720 
02721   if (Is64Bit && isVarArg && !IsWin64) {
02722     // From AMD64 ABI document:
02723     // For calls that may call functions that use varargs or stdargs
02724     // (prototype-less calls or calls to functions containing ellipsis (...) in
02725     // the declaration) %al is used as hidden argument to specify the number
02726     // of SSE registers used. The contents of %al do not need to match exactly
02727     // the number of registers, but must be an upper bound on the number of SSE
02728     // registers used and is in the range 0 - 8 inclusive.
02729 
02730     // Count the number of XMM registers allocated.
02731     static const MCPhysReg XMMArgRegs[] = {
02732       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02733       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02734     };
02735     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
02736     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02737            && "SSE registers cannot be used when SSE is disabled");
02738 
02739     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02740                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
02741   }
02742 
02743   // For tail calls lower the arguments to the 'real' stack slot.
02744   if (isTailCall) {
02745     // Force all the incoming stack arguments to be loaded from the stack
02746     // before any new outgoing arguments are stored to the stack, because the
02747     // outgoing stack slots may alias the incoming argument stack slots, and
02748     // the alias isn't otherwise explicit. This is slightly more conservative
02749     // than necessary, because it means that each store effectively depends
02750     // on every argument instead of just those arguments it would clobber.
02751     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02752 
02753     SmallVector<SDValue, 8> MemOpChains2;
02754     SDValue FIN;
02755     int FI = 0;
02756     if (getTargetMachine().Options.GuaranteedTailCallOpt) {
02757       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02758         CCValAssign &VA = ArgLocs[i];
02759         if (VA.isRegLoc())
02760           continue;
02761         assert(VA.isMemLoc());
02762         SDValue Arg = OutVals[i];
02763         ISD::ArgFlagsTy Flags = Outs[i].Flags;
02764         // Create frame index.
02765         int32_t Offset = VA.getLocMemOffset()+FPDiff;
02766         uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
02767         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
02768         FIN = DAG.getFrameIndex(FI, getPointerTy());
02769 
02770         if (Flags.isByVal()) {
02771           // Copy relative to framepointer.
02772           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
02773           if (StackPtr.getNode() == 0)
02774             StackPtr = DAG.getCopyFromReg(Chain, dl,
02775                                           RegInfo->getStackRegister(),
02776                                           getPointerTy());
02777           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
02778 
02779           MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
02780                                                            ArgChain,
02781                                                            Flags, DAG, dl));
02782         } else {
02783           // Store relative to framepointer.
02784           MemOpChains2.push_back(
02785             DAG.getStore(ArgChain, dl, Arg, FIN,
02786                          MachinePointerInfo::getFixedStack(FI),
02787                          false, false, 0));
02788         }
02789       }
02790     }
02791 
02792     if (!MemOpChains2.empty())
02793       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
02794                           &MemOpChains2[0], MemOpChains2.size());
02795 
02796     // Store the return address to the appropriate stack slot.
02797     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
02798                                      getPointerTy(), RegInfo->getSlotSize(),
02799                                      FPDiff, dl);
02800   }
02801 
02802   // Build a sequence of copy-to-reg nodes chained together with token chain
02803   // and flag operands which copy the outgoing args into registers.
02804   SDValue InFlag;
02805   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
02806     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
02807                              RegsToPass[i].second, InFlag);
02808     InFlag = Chain.getValue(1);
02809   }
02810 
02811   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
02812     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
02813     // In the 64-bit large code model, we have to make all calls
02814     // through a register, since the call instruction's 32-bit
02815     // pc-relative offset may not be large enough to hold the whole
02816     // address.
02817   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
02818     // If the callee is a GlobalAddress node (quite common, every direct call
02819     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
02820     // it.
02821 
02822     // We should use extra load for direct calls to dllimported functions in
02823     // non-JIT mode.
02824     const GlobalValue *GV = G->getGlobal();
02825     if (!GV->hasDLLImportStorageClass()) {
02826       unsigned char OpFlags = 0;
02827       bool ExtraLoad = false;
02828       unsigned WrapperKind = ISD::DELETED_NODE;
02829 
02830       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
02831       // external symbols must go through the PLT in PIC mode.  If the symbol
02832       // has hidden or protected visibility, or if it is static or local, then
02833       // we don't need to use the PLT - we can directly call it.
02834       if (Subtarget->isTargetELF() &&
02835           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
02836           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
02837         OpFlags = X86II::MO_PLT;
02838       } else if (Subtarget->isPICStyleStubAny() &&
02839                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
02840                  (!Subtarget->getTargetTriple().isMacOSX() ||
02841                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
02842         // PC-relative references to external symbols should go through $stub,
02843         // unless we're building with the Leopard linker or later, which
02844         // automatically synthesizes these stubs.
02845         OpFlags = X86II::MO_DARWIN_STUB;
02846       } else if (Subtarget->isPICStyleRIPRel() &&
02847                  isa<Function>(GV) &&
02848                  cast<Function>(GV)->getAttributes().
02849                    hasAttribute(AttributeSet::FunctionIndex,
02850                                 Attribute::NonLazyBind)) {
02851         // If the function is marked as non-lazy, generate an indirect call
02852         // which loads from the GOT directly. This avoids runtime overhead
02853         // at the cost of eager binding (and one extra byte of encoding).
02854         OpFlags = X86II::MO_GOTPCREL;
02855         WrapperKind = X86ISD::WrapperRIP;
02856         ExtraLoad = true;
02857       }
02858 
02859       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
02860                                           G->getOffset(), OpFlags);
02861 
02862       // Add a wrapper if needed.
02863       if (WrapperKind != ISD::DELETED_NODE)
02864         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
02865       // Add extra indirection if needed.
02866       if (ExtraLoad)
02867         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
02868                              MachinePointerInfo::getGOT(),
02869                              false, false, false, 0);
02870     }
02871   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
02872     unsigned char OpFlags = 0;
02873 
02874     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
02875     // external symbols should go through the PLT.
02876     if (Subtarget->isTargetELF() &&
02877         getTargetMachine().getRelocationModel() == Reloc::PIC_) {
02878       OpFlags = X86II::MO_PLT;
02879     } else if (Subtarget->isPICStyleStubAny() &&
02880                (!Subtarget->getTargetTriple().isMacOSX() ||
02881                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
02882       // PC-relative references to external symbols should go through $stub,
02883       // unless we're building with the Leopard linker or later, which
02884       // automatically synthesizes these stubs.
02885       OpFlags = X86II::MO_DARWIN_STUB;
02886     }
02887 
02888     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
02889                                          OpFlags);
02890   }
02891 
02892   // Returns a chain & a flag for retval copy to use.
02893   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
02894   SmallVector<SDValue, 8> Ops;
02895 
02896   if (!IsSibcall && isTailCall) {
02897     Chain = DAG.getCALLSEQ_END(Chain,
02898                                DAG.getIntPtrConstant(NumBytesToPop, true),
02899                                DAG.getIntPtrConstant(0, true), InFlag, dl);
02900     InFlag = Chain.getValue(1);
02901   }
02902 
02903   Ops.push_back(Chain);
02904   Ops.push_back(Callee);
02905 
02906   if (isTailCall)
02907     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
02908 
02909   // Add argument registers to the end of the list so that they are known live
02910   // into the call.
02911   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
02912     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
02913                                   RegsToPass[i].second.getValueType()));
02914 
02915   // Add a register mask operand representing the call-preserved registers.
02916   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
02917   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
02918   assert(Mask && "Missing call preserved mask for calling convention");
02919   Ops.push_back(DAG.getRegisterMask(Mask));
02920 
02921   if (InFlag.getNode())
02922     Ops.push_back(InFlag);
02923 
02924   if (isTailCall) {
02925     // We used to do:
02926     //// If this is the first return lowered for this function, add the regs
02927     //// to the liveout set for the function.
02928     // This isn't right, although it's probably harmless on x86; liveouts
02929     // should be computed from returns not tail calls.  Consider a void
02930     // function making a tail call to a function returning int.
02931     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
02932   }
02933 
02934   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
02935   InFlag = Chain.getValue(1);
02936 
02937   // Create the CALLSEQ_END node.
02938   unsigned NumBytesForCalleeToPop;
02939   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02940                        getTargetMachine().Options.GuaranteedTailCallOpt))
02941     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
02942   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02943            !Subtarget->getTargetTriple().isOSMSVCRT() &&
02944            SR == StackStructReturn)
02945     // If this is a call to a struct-return function, the callee
02946     // pops the hidden struct pointer, so we have to push it back.
02947     // This is common for Darwin/X86, Linux & Mingw32 targets.
02948     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
02949     NumBytesForCalleeToPop = 4;
02950   else
02951     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
02952 
02953   // Returns a flag for retval copy to use.
02954   if (!IsSibcall) {
02955     Chain = DAG.getCALLSEQ_END(Chain,
02956                                DAG.getIntPtrConstant(NumBytesToPop, true),
02957                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
02958                                                      true),
02959                                InFlag, dl);
02960     InFlag = Chain.getValue(1);
02961   }
02962 
02963   // Handle result values, copying them out of physregs into vregs that we
02964   // return.
02965   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
02966                          Ins, dl, DAG, InVals);
02967 }
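// An illustrative, standalone sketch (not part of the X86 lowering itself):
// the AL handling in LowerCall above implements the System V x86-64 rule that
// a variadic call receives, in %al, an upper bound on the number of vector
// registers used for arguments.  The C++ below merely shows the source-level
// pattern that triggers it; the function names are illustrative only.
#include <cstdarg>

namespace al_convention_sketch {
static double sumDoubles(int n, ...) {
  va_list ap;
  va_start(ap, n);
  double total = 0.0;
  for (int i = 0; i < n; ++i)
    total += va_arg(ap, double); // doubles come from the XMM save/overflow area
  va_end(ap);
  return total;
}

double callSite() {
  // The two double arguments travel in XMM0 and XMM1, so for this call the
  // backend emits the equivalent of "movb $2, %al" (any value from 2 to 8
  // would also satisfy the ABI) immediately before the call instruction.
  return sumDoubles(2, 1.5, 2.5);
}
} // namespace al_convention_sketch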
02968 
02969 //===----------------------------------------------------------------------===//
02970 //                Fast Calling Convention (tail call) implementation
02971 //===----------------------------------------------------------------------===//
02972 
02973 //  Like stdcall, the callee cleans up the arguments, except that ECX is
02974 //  reserved for storing the address of the tail-called function. Only 2
02975 //  registers are free for argument passing (inreg). Tail call optimization is
02976 //  performed provided:
02977 //                * tailcallopt is enabled
02978 //                * caller/callee are fastcc
02979 //  On the X86_64 architecture, with GOT-style position-independent code, only
02980 //  local (within-module) calls are supported at the moment.
02981 //  To keep the stack aligned according to the platform ABI, the function
02982 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
02983 //  multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld,
02984 //  for example.) If a tail-called callee has more arguments than the caller,
02985 //  the caller needs to make sure that there is room to move the RETADDR to.
02986 //  This is achieved by reserving an area the size of the argument delta right
02987 //  after the original RETADDR, but before the saved frame pointer or the
02988 //  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
02989 //  stack layout:
02990 //    arg1
02991 //    arg2
02992 //    RETADDR
02993 //    [ new RETADDR
02994 //      move area ]
02995 //    (possible EBP)
02996 //    ESI
02997 //    EDI
02998 //    local1 ..
02999 
03000 /// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 bytes
03001 /// for a 16-byte alignment requirement.
03002 unsigned
03003 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03004                                                SelectionDAG& DAG) const {
03005   MachineFunction &MF = DAG.getMachineFunction();
03006   const TargetMachine &TM = MF.getTarget();
03007   const X86RegisterInfo *RegInfo =
03008     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
03009   const TargetFrameLowering &TFI = *TM.getFrameLowering();
03010   unsigned StackAlignment = TFI.getStackAlignment();
03011   uint64_t AlignMask = StackAlignment - 1;
03012   int64_t Offset = StackSize;
03013   unsigned SlotSize = RegInfo->getSlotSize();
03014   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03015     // The low bits fit below (StackAlignment - SlotSize), so just add the difference.
03016     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03017   } else {
03018     // Mask out the lower bits, then add one StackAlignment plus (StackAlignment - SlotSize).
03019     Offset = ((~AlignMask) & Offset) + StackAlignment +
03020       (StackAlignment-SlotSize);
03021   }
03022   return Offset;
03023 }
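// An illustrative, standalone sketch (not part of the X86 lowering itself):
// a mirror of the arithmetic in GetAlignedArgumentStackSize, assuming a
// 16-byte stack alignment and C++11.  The result always has the form
// StackAlignment*n + (StackAlignment - SlotSize) and is never smaller than
// the input, so that pushing the return address (SlotSize bytes) brings the
// stack back to full alignment.
namespace arg_align_sketch {
constexpr unsigned alignArgSize(unsigned Size, unsigned Align, unsigned Slot) {
  return (Size & (Align - 1)) <= (Align - Slot)
             ? Size + ((Align - Slot) - (Size & (Align - 1)))
             : (Size & ~(Align - 1)) + Align + (Align - Slot);
}
// 32-bit (SlotSize == 4): results have the form 16n + 12.
static_assert(alignArgSize(20, 16, 4) == 28, "pad 20 up to 16 + 12");
static_assert(alignArgSize(30, 16, 4) == 44, "round past the next 16, to 32 + 12");
// 64-bit (SlotSize == 8): results have the form 16n + 8.
static_assert(alignArgSize(20, 16, 8) == 24, "pad 20 up to 16 + 8");
} // namespace arg_align_sketch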
03024 
03025 /// MatchingStackOffset - Return true if the given stack call argument is
03026 /// already available at the same (relative) position in the caller's
03027 /// incoming argument stack.
03028 static
03029 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03030                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03031                          const X86InstrInfo *TII) {
03032   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03033   int FI = INT_MAX;
03034   if (Arg.getOpcode() == ISD::CopyFromReg) {
03035     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03036     if (!TargetRegisterInfo::isVirtualRegister(VR))
03037       return false;
03038     MachineInstr *Def = MRI->getVRegDef(VR);
03039     if (!Def)
03040       return false;
03041     if (!Flags.isByVal()) {
03042       if (!TII->isLoadFromStackSlot(Def, FI))
03043         return false;
03044     } else {
03045       unsigned Opcode = Def->getOpcode();
03046       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
03047           Def->getOperand(1).isFI()) {
03048         FI = Def->getOperand(1).getIndex();
03049         Bytes = Flags.getByValSize();
03050       } else
03051         return false;
03052     }
03053   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03054     if (Flags.isByVal())
03055       // ByVal argument is passed in as a pointer but it's now being
03056       // dereferenced. e.g.
03057       // define @foo(%struct.X* %A) {
03058       //   tail call @bar(%struct.X* byval %A)
03059       // }
03060       return false;
03061     SDValue Ptr = Ld->getBasePtr();
03062     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03063     if (!FINode)
03064       return false;
03065     FI = FINode->getIndex();
03066   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03067     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03068     FI = FINode->getIndex();
03069     Bytes = Flags.getByValSize();
03070   } else
03071     return false;
03072 
03073   assert(FI != INT_MAX);
03074   if (!MFI->isFixedObjectIndex(FI))
03075     return false;
03076   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03077 }
03078 
03079 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03080 /// for tail call optimization. Targets which want to do tail call
03081 /// optimization should implement this function.
03082 bool
03083 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03084                                                      CallingConv::ID CalleeCC,
03085                                                      bool isVarArg,
03086                                                      bool isCalleeStructRet,
03087                                                      bool isCallerStructRet,
03088                                                      Type *RetTy,
03089                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03090                                     const SmallVectorImpl<SDValue> &OutVals,
03091                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03092                                                      SelectionDAG &DAG) const {
03093   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03094     return false;
03095 
03096   // If -tailcallopt is specified, make fastcc functions tail-callable.
03097   const MachineFunction &MF = DAG.getMachineFunction();
03098   const Function *CallerF = MF.getFunction();
03099 
03100   // If the function return type is x86_fp80 and the callee return type is not,
03101   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03102   // perform a tailcall optimization here.
03103   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03104     return false;
03105 
03106   CallingConv::ID CallerCC = CallerF->getCallingConv();
03107   bool CCMatch = CallerCC == CalleeCC;
03108   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03109   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03110 
03111   if (getTargetMachine().Options.GuaranteedTailCallOpt) {
03112     if (IsTailCallConvention(CalleeCC) && CCMatch)
03113       return true;
03114     return false;
03115   }
03116 
03117   // Look for obvious safe cases to perform tail call optimization that do not
03118   // require ABI changes. This is what gcc calls sibcall.
03119 
03120   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03121   // emit a special epilogue.
03122   const X86RegisterInfo *RegInfo =
03123     static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
03124   if (RegInfo->needsStackRealignment(MF))
03125     return false;
03126 
03127   // Also avoid sibcall optimization if either caller or callee uses struct
03128   // return semantics.
03129   if (isCalleeStructRet || isCallerStructRet)
03130     return false;
03131 
03132   // An stdcall/thiscall caller is expected to clean up its arguments; the
03133   // callee isn't going to do that.
03134   // FIXME: this is more restrictive than needed. We could produce a tailcall
03135   // when the stack adjustment matches. For example, with a thiscall that takes
03136   // only one argument.
03137   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03138                    CallerCC == CallingConv::X86_ThisCall))
03139     return false;
03140 
03141   // Do not sibcall optimize vararg calls unless all arguments are passed via
03142   // registers.
03143   if (isVarArg && !Outs.empty()) {
03144 
03145     // Optimizing for varargs on Win64 is unlikely to be safe without
03146     // additional testing.
03147     if (IsCalleeWin64 || IsCallerWin64)
03148       return false;
03149 
03150     SmallVector<CCValAssign, 16> ArgLocs;
03151     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
03152                    getTargetMachine(), ArgLocs, *DAG.getContext());
03153 
03154     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03155     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03156       if (!ArgLocs[i].isRegLoc())
03157         return false;
03158   }
03159 
03160   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03161   // stack.  Therefore, if it's not used by the call it is not safe to optimize
03162   // this into a sibcall.
03163   bool Unused = false;
03164   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03165     if (!Ins[i].Used) {
03166       Unused = true;
03167       break;
03168     }
03169   }
03170   if (Unused) {
03171     SmallVector<CCValAssign, 16> RVLocs;
03172     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
03173                    getTargetMachine(), RVLocs, *DAG.getContext());
03174     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03175     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03176       CCValAssign &VA = RVLocs[i];
03177       if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
03178         return false;
03179     }
03180   }
03181 
03182   // If the calling conventions do not match, then we'd better make sure the
03183   // results are returned in the same way as what the caller expects.
03184   if (!CCMatch) {
03185     SmallVector<CCValAssign, 16> RVLocs1;
03186     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
03187                     getTargetMachine(), RVLocs1, *DAG.getContext());
03188     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03189 
03190     SmallVector<CCValAssign, 16> RVLocs2;
03191     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
03192                     getTargetMachine(), RVLocs2, *DAG.getContext());
03193     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03194 
03195     if (RVLocs1.size() != RVLocs2.size())
03196       return false;
03197     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03198       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03199         return false;
03200       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03201         return false;
03202       if (RVLocs1[i].isRegLoc()) {
03203         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03204           return false;
03205       } else {
03206         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03207           return false;
03208       }
03209     }
03210   }
03211 
03212   // If the callee takes no arguments then go on to check the results of the
03213   // call.
03214   if (!Outs.empty()) {
03215     // Check if stack adjustment is needed. For now, do not do this if any
03216     // argument is passed on the stack.
03217     SmallVector<CCValAssign, 16> ArgLocs;
03218     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
03219                    getTargetMachine(), ArgLocs, *DAG.getContext());
03220 
03221     // Allocate shadow area for Win64
03222     if (IsCalleeWin64)
03223       CCInfo.AllocateStack(32, 8);
03224 
03225     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03226     if (CCInfo.getNextStackOffset()) {
03227       MachineFunction &MF = DAG.getMachineFunction();
03228       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03229         return false;
03230 
03231       // Check if the arguments are already laid out in the same way as
03232       // the caller's fixed stack objects.
03233       MachineFrameInfo *MFI = MF.getFrameInfo();
03234       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03235       const X86InstrInfo *TII =
03236         ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
03237       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03238         CCValAssign &VA = ArgLocs[i];
03239         SDValue Arg = OutVals[i];
03240         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03241         if (VA.getLocInfo() == CCValAssign::Indirect)
03242           return false;
03243         if (!VA.isRegLoc()) {
03244           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03245                                    MFI, MRI, TII))
03246             return false;
03247         }
03248       }
03249     }
03250 
03251     // If the tailcall address may be in a register, then make sure it's
03252     // possible to register allocate for it. In 32-bit, the call address can
03253     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03254     // callee-saved registers are restored. These happen to be the same
03255     // registers used to pass 'inreg' arguments so watch out for those.
03256     if (!Subtarget->is64Bit() &&
03257         ((!isa<GlobalAddressSDNode>(Callee) &&
03258           !isa<ExternalSymbolSDNode>(Callee)) ||
03259          getTargetMachine().getRelocationModel() == Reloc::PIC_)) {
03260       unsigned NumInRegs = 0;
03261       // In PIC we need an extra register to formulate the address computation
03262       // for the callee.
03263       unsigned MaxInRegs =
03264           (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03265 
03266       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03267         CCValAssign &VA = ArgLocs[i];
03268         if (!VA.isRegLoc())
03269           continue;
03270         unsigned Reg = VA.getLocReg();
03271         switch (Reg) {
03272         default: break;
03273         case X86::EAX: case X86::EDX: case X86::ECX:
03274           if (++NumInRegs == MaxInRegs)
03275             return false;
03276           break;
03277         }
03278       }
03279     }
03280   }
03281 
03282   return true;
03283 }
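// An illustrative, standalone sketch (not part of the X86 lowering itself):
// a minimal C++ shape of the "sibcall" case that the predicate above accepts,
// i.e. matching calling conventions, no struct return on either side, no
// dynamic stack realignment, and every outgoing argument either in a register
// or already sitting in the caller's incoming argument slots.  With
// optimizations enabled such a call is typically emitted as a plain "jmp"
// rather than "call"/"ret"; the names below are illustrative only.
namespace sibcall_sketch {
int leaf(int x); // external function using the same (default) calling convention

int forwarder(int x) {
  return leaf(x + 1); // call in tail position: a candidate sibcall
}
} // namespace sibcall_sketch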
03284 
03285 FastISel *
03286 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03287                                   const TargetLibraryInfo *libInfo) const {
03288   return X86::createFastISel(funcInfo, libInfo);
03289 }
03290 
03291 //===----------------------------------------------------------------------===//
03292 //                           Other Lowering Hooks
03293 //===----------------------------------------------------------------------===//
03294 
03295 static bool MayFoldLoad(SDValue Op) {
03296   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03297 }
03298 
03299 static bool MayFoldIntoStore(SDValue Op) {
03300   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03301 }
03302 
03303 static bool isTargetShuffle(unsigned Opcode) {
03304   switch(Opcode) {
03305   default: return false;
03306   case X86ISD::PSHUFD:
03307   case X86ISD::PSHUFHW:
03308   case X86ISD::PSHUFLW:
03309   case X86ISD::SHUFP:
03310   case X86ISD::PALIGNR:
03311   case X86ISD::MOVLHPS:
03312   case X86ISD::MOVLHPD:
03313   case X86ISD::MOVHLPS:
03314   case X86ISD::MOVLPS:
03315   case X86ISD::MOVLPD:
03316   case X86ISD::MOVSHDUP:
03317   case X86ISD::MOVSLDUP:
03318   case X86ISD::MOVDDUP:
03319   case X86ISD::MOVSS:
03320   case X86ISD::MOVSD:
03321   case X86ISD::UNPCKL:
03322   case X86ISD::UNPCKH:
03323   case X86ISD::VPERMILP:
03324   case X86ISD::VPERM2X128:
03325   case X86ISD::VPERMI:
03326     return true;
03327   }
03328 }
03329 
03330 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03331                                     SDValue V1, SelectionDAG &DAG) {
03332   switch(Opc) {
03333   default: llvm_unreachable("Unknown x86 shuffle node");
03334   case X86ISD::MOVSHDUP:
03335   case X86ISD::MOVSLDUP:
03336   case X86ISD::MOVDDUP:
03337     return DAG.getNode(Opc, dl, VT, V1);
03338   }
03339 }
03340 
03341 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03342                                     SDValue V1, unsigned TargetMask,
03343                                     SelectionDAG &DAG) {
03344   switch(Opc) {
03345   default: llvm_unreachable("Unknown x86 shuffle node");
03346   case X86ISD::PSHUFD:
03347   case X86ISD::PSHUFHW:
03348   case X86ISD::PSHUFLW:
03349   case X86ISD::VPERMILP:
03350   case X86ISD::VPERMI:
03351     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03352   }
03353 }
03354 
03355 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03356                                     SDValue V1, SDValue V2, unsigned TargetMask,
03357                                     SelectionDAG &DAG) {
03358   switch(Opc) {
03359   default: llvm_unreachable("Unknown x86 shuffle node");
03360   case X86ISD::PALIGNR:
03361   case X86ISD::SHUFP:
03362   case X86ISD::VPERM2X128:
03363     return DAG.getNode(Opc, dl, VT, V1, V2,
03364                        DAG.getConstant(TargetMask, MVT::i8));
03365   }
03366 }
03367 
03368 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03369                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03370   switch(Opc) {
03371   default: llvm_unreachable("Unknown x86 shuffle node");
03372   case X86ISD::MOVLHPS:
03373   case X86ISD::MOVLHPD:
03374   case X86ISD::MOVHLPS:
03375   case X86ISD::MOVLPS:
03376   case X86ISD::MOVLPD:
03377   case X86ISD::MOVSS:
03378   case X86ISD::MOVSD:
03379   case X86ISD::UNPCKL:
03380   case X86ISD::UNPCKH:
03381     return DAG.getNode(Opc, dl, VT, V1, V2);
03382   }
03383 }
03384 
03385 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03386   MachineFunction &MF = DAG.getMachineFunction();
03387   const X86RegisterInfo *RegInfo =
03388     static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
03389   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03390   int ReturnAddrIndex = FuncInfo->getRAIndex();
03391 
03392   if (ReturnAddrIndex == 0) {
03393     // Set up a frame object for the return address.
03394     unsigned SlotSize = RegInfo->getSlotSize();
03395     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03396                                                            -(int64_t)SlotSize,
03397                                                            false);
03398     FuncInfo->setRAIndex(ReturnAddrIndex);
03399   }
03400 
03401   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03402 }
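// An illustrative, standalone sketch (not part of the X86 lowering itself):
// the fixed frame object created above sits at offset -SlotSize, i.e. the
// slot where the "call" instruction pushed the return address, and it is what
// EmitTailCallLoadRetAddr earlier in this file reloads before a tail call.
// At the source level, that same slot is what the GCC/Clang builtin observes:
namespace retaddr_sketch {
inline void *whoCalledMe() {
  // Returns the caller's return address, i.e. the word the call instruction
  // pushed just before control reached this function.
  return __builtin_return_address(0);
}
} // namespace retaddr_sketch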
03403 
03404 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03405                                        bool hasSymbolicDisplacement) {
03406   // Offset should fit into 32 bit immediate field.
03407   if (!isInt<32>(Offset))
03408     return false;
03409 
03410   // If we don't have a symbolic displacement, we don't have any extra
03411   // restrictions.
03412   if (!hasSymbolicDisplacement)
03413     return true;
03414 
03415   // FIXME: Some tweaks might be needed for medium code model.
03416   if (M != CodeModel::Small && M != CodeModel::Kernel)
03417     return false;
03418 
03419   // For the small code model we assume the last object is within 16MB of the end
03420   // of the 31-bit boundary. We may also accept pretty large negative constants,
03421   // knowing that all objects are in the positive half of the address space.
03422   if (M == CodeModel::Small && Offset < 16*1024*1024)
03423     return true;
03424 
03425   // For the kernel code model we know that all objects reside in the negative
03426   // half of the 32-bit address space. We cannot accept negative offsets, since
03427   // they may fall just outside that range, but we may accept pretty large positive ones.
03428   if (M == CodeModel::Kernel && Offset > 0)
03429     return true;
03430 
03431   return false;
03432 }
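// An illustrative usage sketch (not part of the X86 lowering itself): a few
// sample queries against the rules implemented above.  With a symbolic
// displacement the small code model accepts offsets below 16MB and the kernel
// code model accepts only positive offsets; without a symbolic displacement
// only the 32-bit range check applies.
namespace offset_sketch {
inline void examples() {
  bool a = X86::isOffsetSuitableForCodeModel(4096, CodeModel::Small, true);
  // a == true: well under the 16MB headroom assumed for the small model.
  bool b = X86::isOffsetSuitableForCodeModel(64 * 1024 * 1024, CodeModel::Small,
                                             true);
  // b == false: 64MB exceeds that headroom.
  bool c = X86::isOffsetSuitableForCodeModel(-8, CodeModel::Kernel, true);
  // c == false: kernel objects live in the negative half, so negative offsets
  // are rejected.
  bool d = X86::isOffsetSuitableForCodeModel(-8, CodeModel::Large, false);
  // d == true: no symbolic displacement, so only the 32-bit check matters.
  (void)a; (void)b; (void)c; (void)d;
}
} // namespace offset_sketch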
03433 
03434 /// isCalleePop - Determines whether the callee is required to pop its
03435 /// own arguments. Callee pop is necessary to support tail calls.
03436 bool X86::isCalleePop(CallingConv::ID CallingConv,
03437                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03438   if (IsVarArg)
03439     return false;
03440 
03441   switch (CallingConv) {
03442   default:
03443     return false;
03444   case CallingConv::X86_StdCall:
03445     return !is64Bit;
03446   case CallingConv::X86_FastCall:
03447     return !is64Bit;
03448   case CallingConv::X86_ThisCall:
03449     return !is64Bit;
03450   case CallingConv::Fast:
03451     return TailCallOpt;
03452   case CallingConv::GHC:
03453     return TailCallOpt;
03454   case CallingConv::HiPE:
03455     return TailCallOpt;
03456   }
03457 }
03458 
03459 /// \brief Return true if the condition is an unsigned comparison operation.
03460 static bool isX86CCUnsigned(unsigned X86CC) {
03461   switch (X86CC) {
03462   default: llvm_unreachable("Invalid integer condition!");
03463   case X86::COND_E:     return true;
03464   case X86::COND_G:     return false;
03465   case X86::COND_GE:    return false;
03466   case X86::COND_L:     return false;
03467   case X86::COND_LE:    return false;
03468   case X86::COND_NE:    return true;
03469   case X86::COND_B:     return true;
03470   case X86::COND_A:     return true;
03471   case X86::COND_BE:    return true;
03472   case X86::COND_AE:    return true;
03473   }
03474   llvm_unreachable("covered switch fell through?!");
03475 }
03476 
03477 /// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the X86-
03478 /// specific condition code, returning the condition code and the LHS/RHS of the
03479 /// comparison to make.
03480 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03481                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03482   if (!isFP) {
03483     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03484       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03485         // X > -1   -> X == 0, jump !sign.
03486         RHS = DAG.getConstant(0, RHS.getValueType());
03487         return X86::COND_NS;
03488       }
03489       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03490         // X < 0   -> X == 0, jump on sign.
03491         return X86::COND_S;
03492       }
03493       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03494         // X < 1   -> X <= 0
03495         RHS = DAG.getConstant(0, RHS.getValueType());
03496         return X86::COND_LE;
03497       }
03498     }
03499 
03500     switch (SetCCOpcode) {
03501     default: llvm_unreachable("Invalid integer condition!");
03502     case ISD::SETEQ:  return X86::COND_E;
03503     case ISD::SETGT:  return X86::COND_G;
03504     case ISD::SETGE:  return X86::COND_GE;
03505     case ISD::SETLT:  return X86::COND_L;
03506     case ISD::SETLE:  return X86::COND_LE;
03507     case ISD::SETNE:  return X86::COND_NE;
03508     case ISD::SETULT: return X86::COND_B;
03509     case ISD::SETUGT: return X86::COND_A;
03510     case ISD::SETULE: return X86::COND_BE;
03511     case ISD::SETUGE: return X86::COND_AE;
03512     }
03513   }
03514 
03515   // First determine if it is required or is profitable to flip the operands.
03516 
03517   // If LHS is a foldable load, but RHS is not, flip the condition.
03518   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03519       !ISD::isNON_EXTLoad(RHS.getNode())) {
03520     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03521     std::swap(LHS, RHS);
03522   }
03523 
03524   switch (SetCCOpcode) {
03525   default: break;
03526   case ISD::SETOLT:
03527   case ISD::SETOLE:
03528   case ISD::SETUGT:
03529   case ISD::SETUGE:
03530     std::swap(LHS, RHS);
03531     break;
03532   }
03533 
03534   // On a floating point condition, the flags are set as follows:
03535   // ZF  PF  CF   op
03536   //  0 | 0 | 0 | X > Y
03537   //  0 | 0 | 1 | X < Y
03538   //  1 | 0 | 0 | X == Y
03539   //  1 | 1 | 1 | unordered
03540   switch (SetCCOpcode) {
03541   default: llvm_unreachable("Condcode should be pre-legalized away");
03542   case ISD::SETUEQ:
03543   case ISD::SETEQ:   return X86::COND_E;
03544   case ISD::SETOLT:              // flipped
03545   case ISD::SETOGT:
03546   case ISD::SETGT:   return X86::COND_A;
03547   case ISD::SETOLE:              // flipped
03548   case ISD::SETOGE:
03549   case ISD::SETGE:   return X86::COND_AE;
03550   case ISD::SETUGT:              // flipped
03551   case ISD::SETULT:
03552   case ISD::SETLT:   return X86::COND_B;
03553   case ISD::SETUGE:              // flipped
03554   case ISD::SETULE:
03555   case ISD::SETLE:   return X86::COND_BE;
03556   case ISD::SETONE:
03557   case ISD::SETNE:   return X86::COND_NE;
03558   case ISD::SETUO:   return X86::COND_P;
03559   case ISD::SETO:    return X86::COND_NP;
03560   case ISD::SETOEQ:
03561   case ISD::SETUNE:  return X86::COND_INVALID;
03562   }
03563 }
03564 
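// Worked example (illustrative, not part of the original source): for a
// floating-point compare with SetCCOpcode == ISD::SETOLT (and assuming no
// load-folding swap has already occurred), the second switch above swaps LHS
// and RHS and the final switch returns X86::COND_A, so "X < Y" is emitted as
// an unsigned-above test on the flags produced by comparing Y against X,
// matching the ZF/PF/CF table shown above.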
03565 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
03566 /// code. The current x86 ISA includes the following FP cmov instructions:
03567 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03568 static bool hasFPCMov(unsigned X86CC) {
03569   switch (X86CC) {
03570   default:
03571     return false;
03572   case X86::COND_B:
03573   case X86::COND_BE:
03574   case X86::COND_E:
03575   case X86::COND_P:
03576   case X86::COND_A:
03577   case X86::COND_AE:
03578   case X86::COND_NE:
03579   case X86::COND_NP:
03580     return true;
03581   }
03582 }
03583 
03584 /// isFPImmLegal - Returns true if the target can instruction select the
03585 /// specified FP immediate natively. If false, the legalizer will
03586 /// materialize the FP immediate as a load from a constant pool.
03587 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03588   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03589     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03590       return true;
03591   }
03592   return false;
03593 }
03594 
03595 /// \brief Returns true if it is beneficial to convert a load of a constant
03596 /// to just the constant itself.
03597 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03598                                                           Type *Ty) const {
03599   assert(Ty->isIntegerTy());
03600 
03601   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03602   if (BitSize == 0 || BitSize > 64)
03603     return false;
03604   return true;
03605 }
03606 
03607 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03608 /// the specified range [Low, Hi).
03609 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03610   return (Val < 0) || (Val >= Low && Val < Hi);
03611 }
03612 
03613 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03614 /// specified value.
03615 static bool isUndefOrEqual(int Val, int CmpVal) {
03616   return (Val < 0 || Val == CmpVal);
03617 }
03618 
03619 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03620 /// at position Pos and ending at Pos+Size, is either undef or matches the
03621 /// sequential range [Low, Low+Size).
03622 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03623                                        unsigned Pos, unsigned Size, int Low) {
03624   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03625     if (!isUndefOrEqual(Mask[i], Low))
03626       return false;
03627   return true;
03628 }
03629 
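// Worked example (illustrative, not part of the original source): for the
// mask fragment {4, -1, 6, 7}, isSequentialOrUndefInRange(Mask, 0, 4, 4)
// returns true: element 0 matches Low == 4, element 1 is undef, and elements
// 2 and 3 match the continuing sequence 6, 7.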
03630 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
03631 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
03632 /// the second operand.
03633 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
03634   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
03635     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
03636   if (VT == MVT::v2f64 || VT == MVT::v2i64)
03637     return (Mask[0] < 2 && Mask[1] < 2);
03638   return false;
03639 }
03640 
03641 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
03642 /// is suitable for input to PSHUFHW.
03643 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03644   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03645     return false;
03646 
03647   // Lower quadword copied in order or undef.
03648   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
03649     return false;
03650 
03651   // Upper quadword shuffled.
03652   for (unsigned i = 4; i != 8; ++i)
03653     if (!isUndefOrInRange(Mask[i], 4, 8))
03654       return false;
03655 
03656   if (VT == MVT::v16i16) {
03657     // Lower quadword copied in order or undef.
03658     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
03659       return false;
03660 
03661     // Upper quadword shuffled.
03662     for (unsigned i = 12; i != 16; ++i)
03663       if (!isUndefOrInRange(Mask[i], 12, 16))
03664         return false;
03665   }
03666 
03667   return true;
03668 }
03669 
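// Worked example (illustrative, not part of the original source): the v8i16
// mask <0, 1, 2, 3, 5, 4, 7, 6> is a valid PSHUFHW mask: the lower quadword
// {0,1,2,3} is copied in order and every upper-quadword index stays within
// [4, 8). A mask such as <0, 1, 2, 3, 0, 4, 7, 6> is rejected because the
// value 0 escapes the upper quadword.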
03670 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
03671 /// is suitable for input to PSHUFLW.
03672 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03673   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03674     return false;
03675 
03676   // Upper quadword copied in order.
03677   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
03678     return false;
03679 
03680   // Lower quadword shuffled.
03681   for (unsigned i = 0; i != 4; ++i)
03682     if (!isUndefOrInRange(Mask[i], 0, 4))
03683       return false;
03684 
03685   if (VT == MVT::v16i16) {
03686     // Upper quadword copied in order.
03687     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
03688       return false;
03689 
03690     // Lower quadword shuffled.
03691     for (unsigned i = 8; i != 12; ++i)
03692       if (!isUndefOrInRange(Mask[i], 8, 12))
03693         return false;
03694   }
03695 
03696   return true;
03697 }
03698 
03699 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
03700 /// is suitable for input to PALIGNR.
03701 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
03702                           const X86Subtarget *Subtarget) {
03703   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
03704       (VT.is256BitVector() && !Subtarget->hasInt256()))
03705     return false;
03706 
03707   unsigned NumElts = VT.getVectorNumElements();
03708   unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
03709   unsigned NumLaneElts = NumElts/NumLanes;
03710 
03711   // Do not handle 64-bit element shuffles with palignr.
03712   if (NumLaneElts == 2)
03713     return false;
03714 
03715   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
03716     unsigned i;
03717     for (i = 0; i != NumLaneElts; ++i) {
03718       if (Mask[i+l] >= 0)
03719         break;
03720     }
03721 
03722     // Lane is all undef, go to next lane
03723     if (i == NumLaneElts)
03724       continue;
03725 
03726     int Start = Mask[i+l];
03727 
03728     // Make sure it's in this lane in one of the sources
03729     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
03730         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
03731       return false;
03732 
03733     // If not lane 0, then we must match lane 0
03734     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
03735       return false;
03736 
03737     // Correct second source to be contiguous with first source
03738     if (Start >= (int)NumElts)
03739       Start -= NumElts - NumLaneElts;
03740 
03741     // Make sure we're shifting in the right direction.
03742     if (Start <= (int)(i+l))
03743       return false;
03744 
03745     Start -= i;
03746 
03747     // Check the rest of the elements to see if they are consecutive.
03748     for (++i; i != NumLaneElts; ++i) {
03749       int Idx = Mask[i+l];
03750 
03751       // Make sure it's in this lane
03752       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
03753           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
03754         return false;
03755 
03756       // If not lane 0, then we must match lane 0
03757       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
03758         return false;
03759 
03760       if (Idx >= (int)NumElts)
03761         Idx -= NumElts - NumLaneElts;
03762 
03763       if (!isUndefOrEqual(Idx, Start+i))
03764         return false;
03765 
03766     }
03767   }
03768 
03769   return true;
03770 }
03771 
03772 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
03773 /// the two vector operands have swapped position.
03774 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
03775                                      unsigned NumElems) {
03776   for (unsigned i = 0; i != NumElems; ++i) {
03777     int idx = Mask[i];
03778     if (idx < 0)
03779       continue;
03780     else if (idx < (int)NumElems)
03781       Mask[i] = idx + NumElems;
03782     else
03783       Mask[i] = idx - NumElems;
03784   }
03785 }
03786 
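// Worked example (illustrative, not part of the original source): with
// NumElems == 4, the mask <0, 5, 2, 7> becomes <4, 1, 6, 3> after commuting:
// indices that referred to the first operand now refer to the second and
// vice versa, while undef (-1) entries are left untouched.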
03787 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
03788 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
03789 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
03790 /// reverse of what x86 shuffles want.
03791 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
03792 
03793   unsigned NumElems = VT.getVectorNumElements();
03794   unsigned NumLanes = VT.getSizeInBits()/128;
03795   unsigned NumLaneElems = NumElems/NumLanes;
03796 
03797   if (NumLaneElems != 2 && NumLaneElems != 4)
03798     return false;
03799 
03800   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
03801   bool symetricMaskRequired =
03802     (VT.getSizeInBits() >= 256) && (EltSize == 32);
03803 
03804   // VSHUFPSY divides the resulting vector into 4 chunks.
03805   // The sources are also split into 4 chunks, and each destination
03806   // chunk must come from a different source chunk.
03807   //
03808   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
03809   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
03810   //
03811   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
03812   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
03813   //
03814   // VSHUFPDY divides the resulting vector into 4 chunks.
03815   // The sources are also split into 4 chunks, and each destination
03816   // chunk must come from a different source chunk.
03817   //
03818   //  SRC1 =>      X3       X2       X1       X0
03819   //  SRC2 =>      Y3       Y2       Y1       Y0
03820   //
03821   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
03822   //
03823   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
03824   unsigned HalfLaneElems = NumLaneElems/2;
03825   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
03826     for (unsigned i = 0; i != NumLaneElems; ++i) {
03827       int Idx = Mask[i+l];
03828       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
03829       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
03830         return false;
03831       // For VSHUFPSY, the mask of the second half must be the same as the
03832       // first but with the appropriate offsets. This works in the same way as
03833       // VPERMILPS works with masks.
03834       if (!symetricMaskRequired || Idx < 0)
03835         continue;
03836       if (MaskVal[i] < 0) {
03837         MaskVal[i] = Idx - l;
03838         continue;
03839       }
03840       if ((signed)(Idx - l) != MaskVal[i])
03841         return false;
03842     }
03843   }
03844 
03845   return true;
03846 }
03847 
03848 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
03849 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
03850 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
03851   if (!VT.is128BitVector())
03852     return false;
03853 
03854   unsigned NumElems = VT.getVectorNumElements();
03855 
03856   if (NumElems != 4)
03857     return false;
03858 
03859   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
03860   return isUndefOrEqual(Mask[0], 6) &&
03861          isUndefOrEqual(Mask[1], 7) &&
03862          isUndefOrEqual(Mask[2], 2) &&
03863          isUndefOrEqual(Mask[3], 3);
03864 }
03865 
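// Worked example (illustrative, not part of the original source): the v4f32
// mask <6, 7, 2, 3> matches MOVHLPS: the low half of the result takes the
// high half of the second operand (indices 6 and 7) and the high half keeps
// the high half of the first operand (indices 2 and 3); any of the four
// entries may also be undef (-1).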
03866 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
03867 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
03868 /// <2, 3, 2, 3>
03869 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
03870   if (!VT.is128BitVector())
03871     return false;
03872 
03873   unsigned NumElems = VT.getVectorNumElements();
03874 
03875   if (NumElems != 4)
03876     return false;
03877 
03878   return isUndefOrEqual(Mask[0], 2) &&
03879          isUndefOrEqual(Mask[1], 3) &&
03880          isUndefOrEqual(Mask[2], 2) &&
03881          isUndefOrEqual(Mask[3], 3);
03882 }
03883 
03884 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
03885 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
03886 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
03887   if (!VT.is128BitVector())
03888     return false;
03889 
03890   unsigned NumElems = VT.getVectorNumElements();
03891 
03892   if (NumElems != 2 && NumElems != 4)
03893     return false;
03894 
03895   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
03896     if (!isUndefOrEqual(Mask[i], i + NumElems))
03897       return false;
03898 
03899   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
03900     if (!isUndefOrEqual(Mask[i], i))
03901       return false;
03902 
03903   return true;
03904 }
03905 
03906 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
03907 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
03908 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
03909   if (!VT.is128BitVector())
03910     return false;
03911 
03912   unsigned NumElems = VT.getVectorNumElements();
03913 
03914   if (NumElems != 2 && NumElems != 4)
03915     return false;
03916 
03917   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
03918     if (!isUndefOrEqual(Mask[i], i))
03919       return false;
03920 
03921   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
03922     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
03923       return false;
03924 
03925   return true;
03926 }
03927 
03928 //
03929 // Some special combinations that can be optimized.
03930 //
03931 static
03932 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
03933                                SelectionDAG &DAG) {
03934   MVT VT = SVOp->getSimpleValueType(0);
03935   SDLoc dl(SVOp);
03936 
03937   if (VT != MVT::v8i32 && VT != MVT::v8f32)
03938     return SDValue();
03939 
03940   ArrayRef<int> Mask = SVOp->getMask();
03941 
03942   // These are the special masks that may be optimized.
03943   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
03944   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
03945   bool MatchEvenMask = true;
03946   bool MatchOddMask  = true;
03947   for (int i=0; i<8; ++i) {
03948     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
03949       MatchEvenMask = false;
03950     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
03951       MatchOddMask = false;
03952   }
03953 
03954   if (!MatchEvenMask && !MatchOddMask)
03955     return SDValue();
03956 
03957   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
03958 
03959   SDValue Op0 = SVOp->getOperand(0);
03960   SDValue Op1 = SVOp->getOperand(1);
03961 
03962   if (MatchEvenMask) {
03963     // Shift the second operand right by 32 bits.
03964     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
03965     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
03966   } else {
03967     // Shift the first operand left by 32 bits.
03968     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
03969     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
03970   }
03971   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
03972   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
03973 }
03974 
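// Worked example (illustrative, not part of the original source): for the
// even mask <0, 8, 2, 10, 4, 12, 6, 14> on v8i32 operands, the code above
// first shuffles Op1 with <-1, 0, -1, 2, -1, 4, -1, 6>, moving its even
// elements into odd positions, and then blends with <0, 9, 2, 11, 4, 13, 6, 15>,
// which selects the even elements of Op0 and the odd elements of the shuffled
// Op1 to reproduce the original interleaving.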
03975 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
03976 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
03977 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
03978                          bool HasInt256, bool V2IsSplat = false) {
03979 
03980   assert(VT.getSizeInBits() >= 128 &&
03981          "Unsupported vector type for unpckl");
03982 
03983   // AVX defines UNPCK* to operate independently on 128-bit lanes.
03984   unsigned NumLanes;
03985   unsigned NumOf256BitLanes;
03986   unsigned NumElts = VT.getVectorNumElements();
03987   if (VT.is256BitVector()) {
03988     if (NumElts != 4 && NumElts != 8 &&
03989         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
03990     return false;
03991     NumLanes = 2;
03992     NumOf256BitLanes = 1;
03993   } else if (VT.is512BitVector()) {
03994     assert(VT.getScalarType().getSizeInBits() >= 32 &&
03995            "Unsupported vector type for unpckh");
03996     NumLanes = 2;
03997     NumOf256BitLanes = 2;
03998   } else {
03999     NumLanes = 1;
04000     NumOf256BitLanes = 1;
04001   }
04002 
04003   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
04004   unsigned NumLaneElts = NumEltsInStride/NumLanes;
04005 
04006   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
04007     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
04008       for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04009         int BitI  = Mask[l256*NumEltsInStride+l+i];
04010         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
04011         if (!isUndefOrEqual(BitI, j+l256*NumElts))
04012           return false;
04013         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
04014           return false;
04015         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
04016           return false;
04017       }
04018     }
04019   }
04020   return true;
04021 }
04022 
04023 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
04024 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
04025 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
04026                          bool HasInt256, bool V2IsSplat = false) {
04027   assert(VT.getSizeInBits() >= 128 &&
04028          "Unsupported vector type for unpckh");
04029 
04030   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04031   unsigned NumLanes;
04032   unsigned NumOf256BitLanes;
04033   unsigned NumElts = VT.getVectorNumElements();
04034   if (VT.is256BitVector()) {
04035     if (NumElts != 4 && NumElts != 8 &&
04036         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04037     return false;
04038     NumLanes = 2;
04039     NumOf256BitLanes = 1;
04040   } else if (VT.is512BitVector()) {
04041     assert(VT.getScalarType().getSizeInBits() >= 32 &&
04042            "Unsupported vector type for unpckh");
04043     NumLanes = 2;
04044     NumOf256BitLanes = 2;
04045   } else {
04046     NumLanes = 1;
04047     NumOf256BitLanes = 1;
04048   }
04049 
04050   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
04051   unsigned NumLaneElts = NumEltsInStride/NumLanes;
04052 
04053   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
04054     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
04055       for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04056         int BitI  = Mask[l256*NumEltsInStride+l+i];
04057         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
04058         if (!isUndefOrEqual(BitI, j+l256*NumElts))
04059           return false;
04060         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
04061           return false;
04062         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
04063           return false;
04064       }
04065     }
04066   }
04067   return true;
04068 }
04069 
04070 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
04071 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
04072 /// <0, 0, 1, 1>
04073 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04074   unsigned NumElts = VT.getVectorNumElements();
04075   bool Is256BitVec = VT.is256BitVector();
04076 
04077   if (VT.is512BitVector())
04078     return false;
04079   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04080          "Unsupported vector type for unpckh");
04081 
04082   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
04083       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04084     return false;
04085 
04086   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
04087   // FIXME: Need a better way to get rid of this, there's no latency difference
04088   // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
04089   // the former later. We should also remove the "_undef" special mask.
04090   if (NumElts == 4 && Is256BitVec)
04091     return false;
04092 
04093   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04094   // independently on 128-bit lanes.
04095   unsigned NumLanes = VT.getSizeInBits()/128;
04096   unsigned NumLaneElts = NumElts/NumLanes;
04097 
04098   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04099     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04100       int BitI  = Mask[l+i];
04101       int BitI1 = Mask[l+i+1];
04102 
04103       if (!isUndefOrEqual(BitI, j))
04104         return false;
04105       if (!isUndefOrEqual(BitI1, j))
04106         return false;
04107     }
04108   }
04109 
04110   return true;
04111 }
04112 
04113 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
04114 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
04115 /// <2, 2, 3, 3>
04116 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04117   unsigned NumElts = VT.getVectorNumElements();
04118 
04119   if (VT.is512BitVector())
04120     return false;
04121 
04122   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04123          "Unsupported vector type for unpckh");
04124 
04125   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04126       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04127     return false;
04128 
04129   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04130   // independently on 128-bit lanes.
04131   unsigned NumLanes = VT.getSizeInBits()/128;
04132   unsigned NumLaneElts = NumElts/NumLanes;
04133 
04134   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04135     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04136       int BitI  = Mask[l+i];
04137       int BitI1 = Mask[l+i+1];
04138       if (!isUndefOrEqual(BitI, j))
04139         return false;
04140       if (!isUndefOrEqual(BitI1, j))
04141         return false;
04142     }
04143   }
04144   return true;
04145 }
04146 
04147 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
04148 /// specifies a shuffle of elements that is suitable for input to MOVSS,
04149 /// MOVSD, and MOVD, i.e. setting the lowest element.
04150 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
04151   if (VT.getVectorElementType().getSizeInBits() < 32)
04152     return false;
04153   if (!VT.is128BitVector())
04154     return false;
04155 
04156   unsigned NumElts = VT.getVectorNumElements();
04157 
04158   if (!isUndefOrEqual(Mask[0], NumElts))
04159     return false;
04160 
04161   for (unsigned i = 1; i != NumElts; ++i)
04162     if (!isUndefOrEqual(Mask[i], i))
04163       return false;
04164 
04165   return true;
04166 }
04167 
04168 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
04169 /// as permutations between 128-bit chunks or halves. As an example: this
04170 /// shuffle below:
04171 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
04172 /// The first half comes from the second half of V1 and the second half comes
04173 /// from the second half of V2.
04174 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04175   if (!HasFp256 || !VT.is256BitVector())
04176     return false;
04177 
04178   // The shuffle result is divided into half A and half B. In total the two
04179   // sources have 4 halves, namely: C, D, E, F. The final values of A and
04180   // B must come from C, D, E or F.
04181   unsigned HalfSize = VT.getVectorNumElements()/2;
04182   bool MatchA = false, MatchB = false;
04183 
04184   // Check if A comes from one of C, D, E, F.
04185   for (unsigned Half = 0; Half != 4; ++Half) {
04186     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
04187       MatchA = true;
04188       break;
04189     }
04190   }
04191 
04192   // Check if B comes from one of C, D, E, F.
04193   for (unsigned Half = 0; Half != 4; ++Half) {
04194     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
04195       MatchB = true;
04196       break;
04197     }
04198   }
04199 
04200   return MatchA && MatchB;
04201 }
04202 
04203 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
04204 /// the specified VECTOR_SHUFFLE mask with the VPERM2F128/VPERM2I128 instructions.
04205 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
04206   MVT VT = SVOp->getSimpleValueType(0);
04207 
04208   unsigned HalfSize = VT.getVectorNumElements()/2;
04209 
04210   unsigned FstHalf = 0, SndHalf = 0;
04211   for (unsigned i = 0; i < HalfSize; ++i) {
04212     if (SVOp->getMaskElt(i) > 0) {
04213       FstHalf = SVOp->getMaskElt(i)/HalfSize;
04214       break;
04215     }
04216   }
04217   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
04218     if (SVOp->getMaskElt(i) > 0) {
04219       SndHalf = SVOp->getMaskElt(i)/HalfSize;
04220       break;
04221     }
04222   }
04223 
04224   return (FstHalf | (SndHalf << 4));
04225 }
04226 
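// Worked example (illustrative, not part of the original source): for a
// v8i32 shuffle with mask <4, 5, 6, 7, 12, 13, 14, 15>, HalfSize is 4, so
// FstHalf = 4 / 4 = 1 (the upper half of V1) and SndHalf = 12 / 4 = 3 (the
// upper half of V2), giving the VPERM2F128/VPERM2I128 immediate
// 1 | (3 << 4) = 0x31.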
04227 // Symmetric in-lane mask. Each lane has 4 elements (for imm8).
04228 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
04229   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04230   if (EltSize < 32)
04231     return false;
04232 
04233   unsigned NumElts = VT.getVectorNumElements();
04234   Imm8 = 0;
04235   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
04236     for (unsigned i = 0; i != NumElts; ++i) {
04237       if (Mask[i] < 0)
04238         continue;
04239       Imm8 |= Mask[i] << (i*2);
04240     }
04241     return true;
04242   }
04243 
04244   unsigned LaneSize = 4;
04245   SmallVector<int, 4> MaskVal(LaneSize, -1);
04246 
04247   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04248     for (unsigned i = 0; i != LaneSize; ++i) {
04249       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04250         return false;
04251       if (Mask[i+l] < 0)
04252         continue;
04253       if (MaskVal[i] < 0) {
04254         MaskVal[i] = Mask[i+l] - l;
04255         Imm8 |= MaskVal[i] << (i*2);
04256         continue;
04257       }
04258       if (Mask[i+l] != (signed)(MaskVal[i]+l))
04259         return false;
04260     }
04261   }
04262   return true;
04263 }
04264 
04265 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
04266 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
04267 /// Note that VPERMIL mask matching differs depending on whether the underlying
04268 /// element type is 32 or 64 bits. For VPERMILPS the high half of the mask should
04269 /// select the same element positions as the low half, but from the upper half of the source.
04270 /// In VPERMILPD the two lanes could be shuffled independently of each other
04271 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
04272 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
04273   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04274   if (VT.getSizeInBits() < 256 || EltSize < 32)
04275     return false;
04276   bool symetricMaskRequired = (EltSize == 32);
04277   unsigned NumElts = VT.getVectorNumElements();
04278 
04279   unsigned NumLanes = VT.getSizeInBits()/128;
04280   unsigned LaneSize = NumElts/NumLanes;
04281   // 2 or 4 elements in one lane
04282 
04283   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
04284   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04285     for (unsigned i = 0; i != LaneSize; ++i) {
04286       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04287         return false;
04288       if (symetricMaskRequired) {
04289         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
04290           ExpectedMaskVal[i] = Mask[i+l] - l;
04291           continue;
04292         }
04293         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
04294           return false;
04295       }
04296     }
04297   }
04298   return true;
04299 }
04300 
04301 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
04302 /// x86 MOVSS wants. X86 MOVS{S|D} requires the lowest element to be the lowest
04303 /// element of vector 2 and the other elements to come from vector 1 in order.
04304 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
04305                                bool V2IsSplat = false, bool V2IsUndef = false) {
04306   if (!VT.is128BitVector())
04307     return false;
04308 
04309   unsigned NumOps = VT.getVectorNumElements();
04310   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
04311     return false;
04312 
04313   if (!isUndefOrEqual(Mask[0], 0))
04314     return false;
04315 
04316   for (unsigned i = 1; i != NumOps; ++i)
04317     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
04318           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
04319           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
04320       return false;
04321 
04322   return true;
04323 }
04324 
04325 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04326 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
04327 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
04328 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
04329                            const X86Subtarget *Subtarget) {
04330   if (!Subtarget->hasSSE3())
04331     return false;
04332 
04333   unsigned NumElems = VT.getVectorNumElements();
04334 
04335   if ((VT.is128BitVector() && NumElems != 4) ||
04336       (VT.is256BitVector() && NumElems != 8) ||
04337       (VT.is512BitVector() && NumElems != 16))
04338     return false;
04339 
04340   // "i+1" is the value the indexed mask element must have
04341   for (unsigned i = 0; i != NumElems; i += 2)
04342     if (!isUndefOrEqual(Mask[i], i+1) ||
04343         !isUndefOrEqual(Mask[i+1], i+1))
04344       return false;
04345 
04346   return true;
04347 }
04348 
04349 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04350 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
04351 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
04352 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
04353                            const X86Subtarget *Subtarget) {
04354   if (!Subtarget->hasSSE3())
04355     return false;
04356 
04357   unsigned NumElems = VT.getVectorNumElements();
04358 
04359   if ((VT.is128BitVector() && NumElems != 4) ||
04360       (VT.is256BitVector() && NumElems != 8) ||
04361       (VT.is512BitVector() && NumElems != 16))
04362     return false;
04363 
04364   // "i" is the value the indexed mask element must have
04365   for (unsigned i = 0; i != NumElems; i += 2)
04366     if (!isUndefOrEqual(Mask[i], i) ||
04367         !isUndefOrEqual(Mask[i+1], i))
04368       return false;
04369 
04370   return true;
04371 }
04372 
04373 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
04374 /// specifies a shuffle of elements that is suitable for input to 256-bit
04375 /// version of MOVDDUP.
04376 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04377   if (!HasFp256 || !VT.is256BitVector())
04378     return false;
04379 
04380   unsigned NumElts = VT.getVectorNumElements();
04381   if (NumElts != 4)
04382     return false;
04383 
04384   for (unsigned i = 0; i != NumElts/2; ++i)
04385     if (!isUndefOrEqual(Mask[i], 0))
04386       return false;
04387   for (unsigned i = NumElts/2; i != NumElts; ++i)
04388     if (!isUndefOrEqual(Mask[i], NumElts/2))
04389       return false;
04390   return true;
04391 }
04392 
04393 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04394 /// specifies a shuffle of elements that is suitable for input to 128-bit
04395 /// version of MOVDDUP.
04396 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
04397   if (!VT.is128BitVector())
04398     return false;
04399 
04400   unsigned e = VT.getVectorNumElements() / 2;
04401   for (unsigned i = 0; i != e; ++i)
04402     if (!isUndefOrEqual(Mask[i], i))
04403       return false;
04404   for (unsigned i = 0; i != e; ++i)
04405     if (!isUndefOrEqual(Mask[e+i], i))
04406       return false;
04407   return true;
04408 }
04409 
04410 /// isVEXTRACTIndex - Return true if the specified
04411 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
04412 /// suitable for instructions that extract 128- or 256-bit vectors.
04413 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
04414   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04415   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04416     return false;
04417 
04418   // The index should be aligned on a vecWidth-bit boundary.
04419   uint64_t Index =
04420     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04421 
04422   MVT VT = N->getSimpleValueType(0);
04423   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04424   bool Result = (Index * ElSize) % vecWidth == 0;
04425 
04426   return Result;
04427 }
04428 
04429 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
04430 /// operand specifies a subvector insert that is suitable for input to
04431 /// insertion of 128- or 256-bit subvectors.
04432 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
04433   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04434   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04435     return false;
04436   // The index should be aligned on a vecWidth-bit boundary.
04437   uint64_t Index =
04438     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04439 
04440   MVT VT = N->getSimpleValueType(0);
04441   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04442   bool Result = (Index * ElSize) % vecWidth == 0;
04443 
04444   return Result;
04445 }
04446 
04447 bool X86::isVINSERT128Index(SDNode *N) {
04448   return isVINSERTIndex(N, 128);
04449 }
04450 
04451 bool X86::isVINSERT256Index(SDNode *N) {
04452   return isVINSERTIndex(N, 256);
04453 }
04454 
04455 bool X86::isVEXTRACT128Index(SDNode *N) {
04456   return isVEXTRACTIndex(N, 128);
04457 }
04458 
04459 bool X86::isVEXTRACT256Index(SDNode *N) {
04460   return isVEXTRACTIndex(N, 256);
04461 }
04462 
04463 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
04464 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
04465 /// Handles 128-bit and 256-bit.
04466 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
04467   MVT VT = N->getSimpleValueType(0);
04468 
04469   assert((VT.getSizeInBits() >= 128) &&
04470          "Unsupported vector type for PSHUF/SHUFP");
04471 
04472   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
04473   // independently on 128-bit lanes.
04474   unsigned NumElts = VT.getVectorNumElements();
04475   unsigned NumLanes = VT.getSizeInBits()/128;
04476   unsigned NumLaneElts = NumElts/NumLanes;
04477 
04478   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
04479          "Only supports 2, 4 or 8 elements per lane");
04480 
04481   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
04482   unsigned Mask = 0;
04483   for (unsigned i = 0; i != NumElts; ++i) {
04484     int Elt = N->getMaskElt(i);
04485     if (Elt < 0) continue;
04486     Elt &= NumLaneElts - 1;
04487     unsigned ShAmt = (i << Shift) % 8;
04488     Mask |= Elt << ShAmt;
04489   }
04490 
04491   return Mask;
04492 }
04493 
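// Worked example (illustrative, not part of the original source): for a
// v4f32 shuffle with mask <3, 1, 2, 0>, each element contributes two bits,
// so the immediate is 3 | (1 << 2) | (2 << 4) | (0 << 6) = 0x27, the usual
// PSHUFD/SHUFPS imm8 encoding with the selector for element 0 in the low
// two bits.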
04494 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
04495 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
04496 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
04497   MVT VT = N->getSimpleValueType(0);
04498 
04499   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04500          "Unsupported vector type for PSHUFHW");
04501 
04502   unsigned NumElts = VT.getVectorNumElements();
04503 
04504   unsigned Mask = 0;
04505   for (unsigned l = 0; l != NumElts; l += 8) {
04506     // 8 nodes per lane, but we only care about the last 4.
04507     for (unsigned i = 0; i < 4; ++i) {
04508       int Elt = N->getMaskElt(l+i+4);
04509       if (Elt < 0) continue;
04510       Elt &= 0x3; // only 2-bits.
04511       Mask |= Elt << (i * 2);
04512     }
04513   }
04514 
04515   return Mask;
04516 }
04517 
04518 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
04519 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
04520 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
04521   MVT VT = N->getSimpleValueType(0);
04522 
04523   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04524          "Unsupported vector type for PSHUFHW");
04525 
04526   unsigned NumElts = VT.getVectorNumElements();
04527 
04528   unsigned Mask = 0;
04529   for (unsigned l = 0; l != NumElts; l += 8) {
04530     // 8 nodes per lane, but we only care about the first 4.
04531     for (unsigned i = 0; i < 4; ++i) {
04532       int Elt = N->getMaskElt(l+i);
04533       if (Elt < 0) continue;
04534       Elt &= 0x3; // only 2-bits
04535       Mask |= Elt << (i * 2);
04536     }
04537   }
04538 
04539   return Mask;
04540 }
04541 
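// Worked example (illustrative, not part of the original source): for the
// v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4>, getShufflePSHUFHWImmediate looks at
// elements 4..7, i.e. 7, 6, 5, 4, masks each to two bits, and packs them as
// 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B; the same mask's lower quadword
// <0, 1, 2, 3> would give getShufflePSHUFLWImmediate an immediate of 0xE4.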
04542 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
04543 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
04544 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
04545   MVT VT = SVOp->getSimpleValueType(0);
04546   unsigned EltSize = VT.is512BitVector() ? 1 :
04547     VT.getVectorElementType().getSizeInBits() >> 3;
04548 
04549   unsigned NumElts = VT.getVectorNumElements();
04550   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
04551   unsigned NumLaneElts = NumElts/NumLanes;
04552 
04553   int Val = 0;
04554   unsigned i;
04555   for (i = 0; i != NumElts; ++i) {
04556     Val = SVOp->getMaskElt(i);
04557     if (Val >= 0)
04558       break;
04559   }
04560   if (Val >= (int)NumElts)
04561     Val -= NumElts - NumLaneElts;
04562 
04563   assert(Val - i > 0 && "PALIGNR imm should be positive");
04564   return (Val - i) * EltSize;
04565 }
04566 
04567 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
04568   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04569   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04570     llvm_unreachable("Illegal extract subvector for VEXTRACT");
04571 
04572   uint64_t Index =
04573     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04574 
04575   MVT VecVT = N->getOperand(0).getSimpleValueType();
04576   MVT ElVT = VecVT.getVectorElementType();
04577 
04578   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04579   return Index / NumElemsPerChunk;
04580 }
04581 
04582 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
04583   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04584   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04585     llvm_unreachable("Illegal insert subvector for VINSERT");
04586 
04587   uint64_t Index =
04588     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04589 
04590   MVT VecVT = N->getSimpleValueType(0);
04591   MVT ElVT = VecVT.getVectorElementType();
04592 
04593   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04594   return Index / NumElemsPerChunk;
04595 }
04596 
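// Worked example (illustrative, not part of the original source): extracting
// a 128-bit subvector starting at element 4 of a v8i32 value gives
// NumElemsPerChunk = 128 / 32 = 4, so getExtractVEXTRACTImmediate returns
// 4 / 4 = 1, selecting the upper 128-bit half; an index of 0 would select
// the lower half. The VINSERT variant computes its immediate the same way.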
04597 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
04598 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
04599 /// and VEXTRACTI128 instructions.
04600 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
04601   return getExtractVEXTRACTImmediate(N, 128);
04602 }
04603 
04604 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
04605 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
04606 /// and VEXTRACTI64x4 instructions.
04607 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
04608   return getExtractVEXTRACTImmediate(N, 256);
04609 }
04610 
04611 /// getInsertVINSERT128Immediate - Return the appropriate immediate
04612 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
04613 /// and VINSERTI128 instructions.
04614 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
04615   return getInsertVINSERTImmediate(N, 128);
04616 }
04617 
04618 /// getInsertVINSERT256Immediate - Return the appropriate immediate
04619 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
04620 /// and VINSERTI64x4 instructions.
04621 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
04622   return getInsertVINSERTImmediate(N, 256);
04623 }
04624 
04625 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
04626 /// constant +0.0.
04627 bool X86::isZeroNode(SDValue Elt) {
04628   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
04629     return CN->isNullValue();
04630   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
04631     return CFP->getValueAPF().isPosZero();
04632   return false;
04633 }
04634 
04635 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
04636 /// their permute mask.
04637 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
04638                                     SelectionDAG &DAG) {
04639   MVT VT = SVOp->getSimpleValueType(0);
04640   unsigned NumElems = VT.getVectorNumElements();
04641   SmallVector<int, 8> MaskVec;
04642 
04643   for (unsigned i = 0; i != NumElems; ++i) {
04644     int Idx = SVOp->getMaskElt(i);
04645     if (Idx >= 0) {
04646       if (Idx < (int)NumElems)
04647         Idx += NumElems;
04648       else
04649         Idx -= NumElems;
04650     }
04651     MaskVec.push_back(Idx);
04652   }
04653   return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
04654                               SVOp->getOperand(0), &MaskVec[0]);
04655 }
04656 
04657 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
04658 /// match movhlps. The lower half elements should come from the upper half of
04659 /// V1 (and in order), and the upper half elements should come from the upper
04660 /// half of V2 (and in order).
04661 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
04662   if (!VT.is128BitVector())
04663     return false;
04664   if (VT.getVectorNumElements() != 4)
04665     return false;
04666   for (unsigned i = 0, e = 2; i != e; ++i)
04667     if (!isUndefOrEqual(Mask[i], i+2))
04668       return false;
04669   for (unsigned i = 2; i != 4; ++i)
04670     if (!isUndefOrEqual(Mask[i], i+4))
04671       return false;
04672   return true;
04673 }
04674 
04675 /// isScalarLoadToVector - Returns true if the node is a scalar load that
04676 /// is promoted to a vector. It also returns the LoadSDNode by reference if
04677 /// required.
04678 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
04679   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
04680     return false;
04681   N = N->getOperand(0).getNode();
04682   if (!ISD::isNON_EXTLoad(N))
04683     return false;
04684   if (LD)
04685     *LD = cast<LoadSDNode>(N);
04686   return true;
04687 }
04688 
04689 // Test whether the given value is a vector value which will be legalized
04690 // into a load.
04691 static bool WillBeConstantPoolLoad(SDNode *N) {
04692   if (N->getOpcode() != ISD::BUILD_VECTOR)
04693     return false;
04694 
04695   // Check for any non-constant elements.
04696   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
04697     switch (N->getOperand(i).getNode()->getOpcode()) {
04698     case ISD::UNDEF:
04699     case ISD::ConstantFP:
04700     case ISD::Constant:
04701       break;
04702     default:
04703       return false;
04704     }
04705 
04706   // Vectors of all-zeros and all-ones are materialized with special
04707   // instructions rather than being loaded.
04708   return !ISD::isBuildVectorAllZeros(N) &&
04709          !ISD::isBuildVectorAllOnes(N);
04710 }
04711 
04712 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
04713 /// match movlp{s|d}. The lower half elements should come from lower half of
04714 /// V1 (and in order), and the upper half elements should come from the upper
04715 /// half of V2 (and in order). And since V1 will become the source of the
04716 /// MOVLP, it must be either a vector load or a scalar load to vector.
04717 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
04718                                ArrayRef<int> Mask, MVT VT) {
04719   if (!VT.is128BitVector())
04720     return false;
04721 
04722   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
04723     return false;
04724   // If V2 is a vector load, don't do this transformation. We will try to use a
04725   // load-folding shufps op instead.
04726   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
04727     return false;
04728 
04729   unsigned NumElems = VT.getVectorNumElements();
04730 
04731   if (NumElems != 2 && NumElems != 4)
04732     return false;
04733   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04734     if (!isUndefOrEqual(Mask[i], i))
04735       return false;
04736   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
04737     if (!isUndefOrEqual(Mask[i], i+NumElems))
04738       return false;
04739   return true;
04740 }
04741 
04742 /// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
04743 /// all the same.
04744 static bool isSplatVector(SDNode *N) {
04745   if (N->getOpcode() != ISD::BUILD_VECTOR)
04746     return false;
04747 
04748   SDValue SplatValue = N->getOperand(0);
04749   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
04750     if (N->getOperand(i) != SplatValue)
04751       return false;
04752   return true;
04753 }
04754 
04755 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
04756 /// to a zero vector.
04757 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
04758 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
04759   SDValue V1 = N->getOperand(0);
04760   SDValue V2 = N->getOperand(1);
04761   unsigned NumElems = N->getValueType(0).getVectorNumElements();
04762   for (unsigned i = 0; i != NumElems; ++i) {
04763     int Idx = N->getMaskElt(i);
04764     if (Idx >= (int)NumElems) {
04765       unsigned Opc = V2.getOpcode();
04766       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
04767         continue;
04768       if (Opc != ISD::BUILD_VECTOR ||
04769           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
04770         return false;
04771     } else if (Idx >= 0) {
04772       unsigned Opc = V1.getOpcode();
04773       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
04774         continue;
04775       if (Opc != ISD::BUILD_VECTOR ||
04776           !X86::isZeroNode(V1.getOperand(Idx)))
04777         return false;
04778     }
04779   }
04780   return true;
04781 }
04782 
04783 /// getZeroVector - Returns a vector of specified type with all zero elements.
04784 ///
04785 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
04786                              SelectionDAG &DAG, SDLoc dl) {
04787   assert(VT.isVector() && "Expected a vector type");
04788 
04789   // Always build SSE zero vectors as <4 x i32> bitcasted
04790   // to their dest type. This ensures they get CSE'd.
04791   SDValue Vec;
04792   if (VT.is128BitVector()) {  // SSE
04793     if (Subtarget->hasSSE2()) {  // SSE2
04794       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04795       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04796     } else { // SSE1
04797       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
04798       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
04799     }
04800   } else if (VT.is256BitVector()) { // AVX
04801     if (Subtarget->hasInt256()) { // AVX2
04802       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04803       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04804       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
04805                         array_lengthof(Ops));
04806     } else {
04807       // 256-bit logic and arithmetic instructions in AVX are all
04808       // floating-point, no support for integer ops. Emit fp zeroed vectors.
04809       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
04810       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04811       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
04812                         array_lengthof(Ops));
04813     }
04814   } else if (VT.is512BitVector()) { // AVX-512
04815       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04816       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
04817                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04818       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
04819   } else if (VT.getScalarType() == MVT::i1) {
04820     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
04821     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
04822     SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
04823                       Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04824     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
04825                        Ops, VT.getVectorNumElements());
04826   } else
04827     llvm_unreachable("Unexpected vector type");
04828 
04829   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04830 }
04831 
04832 /// getOnesVector - Returns a vector of specified type with all bits set.
04833 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
04834 /// no AVX2 support, use two <4 x i32> vectors inserted into an <8 x i32> appropriately.
04835 /// Then bitcast to their original type, ensuring they get CSE'd.
04836 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
04837                              SDLoc dl) {
04838   assert(VT.isVector() && "Expected a vector type");
04839 
04840   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
04841   SDValue Vec;
04842   if (VT.is256BitVector()) {
04843     if (HasInt256) { // AVX2
04844       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04845       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
04846                         array_lengthof(Ops));
04847     } else { // AVX
04848       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04849       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
04850     }
04851   } else if (VT.is128BitVector()) {
04852     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04853   } else
04854     llvm_unreachable("Unexpected vector type");
04855 
04856   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04857 }
04858 
04859 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
04860 /// that point to V2 point to its first element.
04861 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
04862   for (unsigned i = 0; i != NumElems; ++i) {
04863     if (Mask[i] > (int)NumElems) {
04864       Mask[i] = NumElems;
04865     }
04866   }
04867 }
04868 
04869 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
04870 /// operation of specified width.
04871 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
04872                        SDValue V2) {
04873   unsigned NumElems = VT.getVectorNumElements();
04874   SmallVector<int, 8> Mask;
04875   Mask.push_back(NumElems);
04876   for (unsigned i = 1; i != NumElems; ++i)
04877     Mask.push_back(i);
04878   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04879 }
04880 
04881 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
04882 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04883                           SDValue V2) {
04884   unsigned NumElems = VT.getVectorNumElements();
04885   SmallVector<int, 8> Mask;
04886   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
04887     Mask.push_back(i);
04888     Mask.push_back(i + NumElems);
04889   }
04890   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04891 }
04892 
04893 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
04894 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04895                           SDValue V2) {
04896   unsigned NumElems = VT.getVectorNumElements();
04897   SmallVector<int, 8> Mask;
04898   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
04899     Mask.push_back(i + Half);
04900     Mask.push_back(i + NumElems + Half);
04901   }
04902   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04903 }
04904 
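// Worked example (illustrative, not part of the original source): for a
// v4i32 type, getUnpackl builds the interleaving mask <0, 4, 1, 5>
// (PUNPCKLDQ-style) and getUnpackh builds <2, 6, 3, 7> (PUNPCKHDQ-style),
// taking elements alternately from V1 and V2 in the low or high half
// respectively.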
04905 // PromoteSplati8i16 - i16 and i8 vector types can't be used directly by a
04906 // generic shuffle instruction because the target has no such instructions.
04907 // Generate shuffles which repeat the i16 and i8 elements several times until
04908 // they can be represented by v4f32 and then manipulated by target-supported shuffles.
04909 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
04910   MVT VT = V.getSimpleValueType();
04911   int NumElems = VT.getVectorNumElements();
04912   SDLoc dl(V);
04913 
04914   while (NumElems > 4) {
04915     if (EltNo < NumElems/2) {
04916       V = getUnpackl(DAG, dl, VT, V, V);
04917     } else {
04918       V = getUnpackh(DAG, dl, VT, V, V);
04919       EltNo -= NumElems/2;
04920     }
04921     NumElems >>= 1;
04922   }
04923   return V;
04924 }
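
// Worked example: splatting element 11 of a v16i8 value V. The first pass
// takes the unpackh branch (11 >= 8), after which bytes 6..7 of the result
// hold element 11 and EltNo becomes 3. The second pass takes the unpackl
// branch (3 < 4), after which bytes 12..15 all hold element 11. The loop
// stops with NumElems == 4, and getLegalSplat can then splat f32 lane
// EltNo == 3, which is exactly those four bytes.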
04925 
04926 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
04927 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
04928   MVT VT = V.getSimpleValueType();
04929   SDLoc dl(V);
04930 
04931   if (VT.is128BitVector()) {
04932     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
04933     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
04934     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
04935                              &SplatMask[0]);
04936   } else if (VT.is256BitVector()) {
04937     // To use VPERMILPS to splat scalars, the second half of indices must
04938     // refer to the higher part, which is a duplication of the lower one,
04939     // because VPERMILPS can only handle in-lane permutations.
04940     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
04941                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
04942 
04943     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
04944     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
04945                              &SplatMask[0]);
04946   } else
04947     llvm_unreachable("Vector size not supported");
04948 
04949   return DAG.getNode(ISD::BITCAST, dl, VT, V);
04950 }
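
// For illustration: splatting lane 2 of a 256-bit vector uses the mask
// <2, 2, 2, 2, 6, 6, 6, 6>. PromoteSplat (below) arranges for both 128-bit
// halves to be identical before calling this, so lane 6 of the upper half
// holds the same value as lane 2 of the lower half and the in-lane
// VPERMILPS still produces a full 256-bit splat.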
04951 
04952 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
04953 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
04954   MVT SrcVT = SV->getSimpleValueType(0);
04955   SDValue V1 = SV->getOperand(0);
04956   SDLoc dl(SV);
04957 
04958   int EltNo = SV->getSplatIndex();
04959   int NumElems = SrcVT.getVectorNumElements();
04960   bool Is256BitVec = SrcVT.is256BitVector();
04961 
04962   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
04963          "Unknown how to promote splat for type");
04964 
04965   // Extract the 128-bit part containing the splat element and update
04966   // the splat element index when it refers to the higher register.
04967   if (Is256BitVec) {
04968     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
04969     if (EltNo >= NumElems/2)
04970       EltNo -= NumElems/2;
04971   }
04972 
04973   // All i16 and i8 vector types can't be used directly by a generic shuffle
04974   // instruction because the target has no such instruction. Generate shuffles
04975   // which repeat i16 and i8 several times until they fit in i32, and then can
04976   // be manipulated by target supported shuffles.
04977   MVT EltVT = SrcVT.getVectorElementType();
04978   if (EltVT == MVT::i8 || EltVT == MVT::i16)
04979     V1 = PromoteSplati8i16(V1, DAG, EltNo);
04980 
04981   // Recreate the 256-bit vector and place the same 128-bit vector
04982   // into the low and high part. This is necessary because we want
04983   // to use VPERM* to shuffle the vectors
04984   if (Is256BitVec) {
04985     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
04986   }
04987 
04988   return getLegalSplat(DAG, V1, EltNo);
04989 }
04990 
04991 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
04992 /// vector and a zero or undef vector.  This produces a shuffle where the low
04993 /// element of V2 is swizzled into the zero/undef vector, landing at element
04994 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
04995 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
04996                                            bool IsZero,
04997                                            const X86Subtarget *Subtarget,
04998                                            SelectionDAG &DAG) {
04999   MVT VT = V2.getSimpleValueType();
05000   SDValue V1 = IsZero
05001     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
05002   unsigned NumElems = VT.getVectorNumElements();
05003   SmallVector<int, 16> MaskVec;
05004   for (unsigned i = 0; i != NumElems; ++i)
05005     // If this is the insertion idx, put the low elt of V2 here.
05006     MaskVec.push_back(i == Idx ? NumElems : i);
05007   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
05008 }
05009 
05010 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
05011 /// target specific opcode. Returns true if the Mask could be calculated.
05012 /// Sets IsUnary to true if the shuffle uses only one source.
05013 static bool getTargetShuffleMask(SDNode *N, MVT VT,
05014                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
05015   unsigned NumElems = VT.getVectorNumElements();
05016   SDValue ImmN;
05017 
05018   IsUnary = false;
05019   switch(N->getOpcode()) {
05020   case X86ISD::SHUFP:
05021     ImmN = N->getOperand(N->getNumOperands()-1);
05022     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05023     break;
05024   case X86ISD::UNPCKH:
05025     DecodeUNPCKHMask(VT, Mask);
05026     break;
05027   case X86ISD::UNPCKL:
05028     DecodeUNPCKLMask(VT, Mask);
05029     break;
05030   case X86ISD::MOVHLPS:
05031     DecodeMOVHLPSMask(NumElems, Mask);
05032     break;
05033   case X86ISD::MOVLHPS:
05034     DecodeMOVLHPSMask(NumElems, Mask);
05035     break;
05036   case X86ISD::PALIGNR:
05037     ImmN = N->getOperand(N->getNumOperands()-1);
05038     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05039     break;
05040   case X86ISD::PSHUFD:
05041   case X86ISD::VPERMILP:
05042     ImmN = N->getOperand(N->getNumOperands()-1);
05043     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05044     IsUnary = true;
05045     break;
05046   case X86ISD::PSHUFHW:
05047     ImmN = N->getOperand(N->getNumOperands()-1);
05048     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05049     IsUnary = true;
05050     break;
05051   case X86ISD::PSHUFLW:
05052     ImmN = N->getOperand(N->getNumOperands()-1);
05053     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05054     IsUnary = true;
05055     break;
05056   case X86ISD::VPERMI:
05057     ImmN = N->getOperand(N->getNumOperands()-1);
05058     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05059     IsUnary = true;
05060     break;
05061   case X86ISD::MOVSS:
05062   case X86ISD::MOVSD: {
05063     // The index 0 always comes from the first element of the second source;
05064     // this is why MOVSS and MOVSD are used in the first place. The other
05065     // elements come from the other positions of the first source vector.
05066     Mask.push_back(NumElems);
05067     for (unsigned i = 1; i != NumElems; ++i) {
05068       Mask.push_back(i);
05069     }
05070     break;
05071   }
05072   case X86ISD::VPERM2X128:
05073     ImmN = N->getOperand(N->getNumOperands()-1);
05074     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05075     if (Mask.empty()) return false;
05076     break;
05077   case X86ISD::MOVDDUP:
05078   case X86ISD::MOVLHPD:
05079   case X86ISD::MOVLPD:
05080   case X86ISD::MOVLPS:
05081   case X86ISD::MOVSHDUP:
05082   case X86ISD::MOVSLDUP:
05083     // Not yet implemented
05084     return false;
05085   default: llvm_unreachable("unknown target shuffle node");
05086   }
05087 
05088   return true;
05089 }
05090 
05091 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
05092 /// element of the result of the vector shuffle.
05093 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
05094                                    unsigned Depth) {
05095   if (Depth == 6)
05096     return SDValue();  // Limit search depth.
05097 
05098   SDValue V = SDValue(N, 0);
05099   EVT VT = V.getValueType();
05100   unsigned Opcode = V.getOpcode();
05101 
05102   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
05103   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
05104     int Elt = SV->getMaskElt(Index);
05105 
05106     if (Elt < 0)
05107       return DAG.getUNDEF(VT.getVectorElementType());
05108 
05109     unsigned NumElems = VT.getVectorNumElements();
05110     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
05111                                          : SV->getOperand(1);
05112     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
05113   }
05114 
05115   // Recurse into target specific vector shuffles to find scalars.
05116   if (isTargetShuffle(Opcode)) {
05117     MVT ShufVT = V.getSimpleValueType();
05118     unsigned NumElems = ShufVT.getVectorNumElements();
05119     SmallVector<int, 16> ShuffleMask;
05120     bool IsUnary;
05121 
05122     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
05123       return SDValue();
05124 
05125     int Elt = ShuffleMask[Index];
05126     if (Elt < 0)
05127       return DAG.getUNDEF(ShufVT.getVectorElementType());
05128 
05129     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
05130                                          : N->getOperand(1);
05131     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
05132                                Depth+1);
05133   }
05134 
05135   // Actual nodes that may contain scalar elements
05136   if (Opcode == ISD::BITCAST) {
05137     V = V.getOperand(0);
05138     EVT SrcVT = V.getValueType();
05139     unsigned NumElems = VT.getVectorNumElements();
05140 
05141     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
05142       return SDValue();
05143   }
05144 
05145   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
05146     return (Index == 0) ? V.getOperand(0)
05147                         : DAG.getUNDEF(VT.getVectorElementType());
05148 
05149   if (V.getOpcode() == ISD::BUILD_VECTOR)
05150     return V.getOperand(Index);
05151 
05152   return SDValue();
05153 }
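
// For illustration: for N = (vector_shuffle<4,1,2,3> (BUILD_VECTOR a,b,c,d),
// (BUILD_VECTOR e,f,g,h)) and Index == 0, the mask element is 4, so the
// search recurses into the second operand with index 4 % 4 == 0 and returns
// the scalar 'e'.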
05154 
05155 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
05156 /// shuffle operation which come consecutively from a zero. The
05157 /// search can start in two different directions, from left or right.
05158 /// We count undefs as zeros until PreferredNum is reached.
05159 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
05160                                          unsigned NumElems, bool ZerosFromLeft,
05161                                          SelectionDAG &DAG,
05162                                          unsigned PreferredNum = -1U) {
05163   unsigned NumZeros = 0;
05164   for (unsigned i = 0; i != NumElems; ++i) {
05165     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
05166     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
05167     if (!Elt.getNode())
05168       break;
05169 
05170     if (X86::isZeroNode(Elt))
05171       ++NumZeros;
05172     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
05173       NumZeros = std::min(NumZeros + 1, PreferredNum);
05174     else
05175       break;
05176   }
05177 
05178   return NumZeros;
05179 }
05180 
05181 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
05182 /// correspond consecutively to elements from one of the vector operands,
05183 /// starting from its index OpIdx. OpNum is set to the matched source operand.
05184 static
05185 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
05186                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
05187                               unsigned NumElems, unsigned &OpNum) {
05188   bool SeenV1 = false;
05189   bool SeenV2 = false;
05190 
05191   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
05192     int Idx = SVOp->getMaskElt(i);
05193     // Ignore undef indices
05194     if (Idx < 0)
05195       continue;
05196 
05197     if (Idx < (int)NumElems)
05198       SeenV1 = true;
05199     else
05200       SeenV2 = true;
05201 
05202     // Only accept consecutive elements from the same vector
05203     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
05204       return false;
05205   }
05206 
05207   OpNum = SeenV1 ? 0 : 1;
05208   return true;
05209 }
05210 
05211 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
05212 /// logical right shift of a vector.
05213 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05214                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05215   unsigned NumElems =
05216     SVOp->getSimpleValueType(0).getVectorNumElements();
05217   unsigned NumZeros = getNumOfConsecutiveZeros(
05218       SVOp, NumElems, false /* check zeros from right */, DAG,
05219       SVOp->getMaskElt(0));
05220   unsigned OpSrc;
05221 
05222   if (!NumZeros)
05223     return false;
05224 
05225   // Considering the elements in the mask that are not consecutive zeros,
05226   // check if they consecutively come from only one of the source vectors.
05227   //
05228   //               V1 = {X, A, B, C}     0
05229   //                         \  \  \    /
05230   //   vector_shuffle V1, V2 <1, 2, 3, X>
05231   //
05232   if (!isShuffleMaskConsecutive(SVOp,
05233             0,                   // Mask Start Index
05234             NumElems-NumZeros,   // Mask End Index(exclusive)
05235             NumZeros,            // Where to start looking in the src vector
05236             NumElems,            // Number of elements in vector
05237             OpSrc))              // Which source operand ?
05238     return false;
05239 
05240   isLeft = false;
05241   ShAmt = NumZeros;
05242   ShVal = SVOp->getOperand(OpSrc);
05243   return true;
05244 }
05245 
05246 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
05247 /// logical left shift of a vector.
05248 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05249                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05250   unsigned NumElems =
05251     SVOp->getSimpleValueType(0).getVectorNumElements();
05252   unsigned NumZeros = getNumOfConsecutiveZeros(
05253       SVOp, NumElems, true /* check zeros from left */, DAG,
05254       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
05255   unsigned OpSrc;
05256 
05257   if (!NumZeros)
05258     return false;
05259 
05260   // Considering the elements in the mask that are not consecutive zeros,
05261   // check if they consecutively come from only one of the source vectors.
05262   //
05263   //                           0    { A, B, X, X } = V2
05264   //                          / \    /  /
05265   //   vector_shuffle V1, V2 <X, X, 4, 5>
05266   //
05267   if (!isShuffleMaskConsecutive(SVOp,
05268             NumZeros,     // Mask Start Index
05269             NumElems,     // Mask End Index(exclusive)
05270             0,            // Where to start looking in the src vector
05271             NumElems,     // Number of elements in vector
05272             OpSrc))       // Which source operand ?
05273     return false;
05274 
05275   isLeft = true;
05276   ShAmt = NumZeros;
05277   ShVal = SVOp->getOperand(OpSrc);
05278   return true;
05279 }
05280 
05281 /// isVectorShift - Returns true if the shuffle can be implemented as a
05282 /// logical left or right shift of a vector.
05283 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05284                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05285   // Although the logic below supports any bitwidth size, there are no
05286   // shift instructions which handle more than 128-bit vectors.
05287   if (!SVOp->getSimpleValueType(0).is128BitVector())
05288     return false;
05289 
05290   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
05291       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
05292     return true;
05293 
05294   return false;
05295 }
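
// For illustration: a v4i32 shuffle with mask <4, 4, 0, 1> whose second
// operand is an all-zeros build_vector is accepted by isVectorShiftLeft:
// two leading zeros, and the remaining mask elements 0, 1 come consecutively
// from V1 starting at index 0. This yields isLeft == true and ShAmt == 2
// elements, which corresponds to shifting the whole register left by
// 8 bytes (PSLLDQ $8).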
05296 
05297 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
05298 ///
05299 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
05300                                        unsigned NumNonZero, unsigned NumZero,
05301                                        SelectionDAG &DAG,
05302                                        const X86Subtarget* Subtarget,
05303                                        const TargetLowering &TLI) {
05304   if (NumNonZero > 8)
05305     return SDValue();
05306 
05307   SDLoc dl(Op);
05308   SDValue V(0, 0);
05309   bool First = true;
05310   for (unsigned i = 0; i < 16; ++i) {
05311     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
05312     if (ThisIsNonZero && First) {
05313       if (NumZero)
05314         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05315       else
05316         V = DAG.getUNDEF(MVT::v8i16);
05317       First = false;
05318     }
05319 
05320     if ((i & 1) != 0) {
05321       SDValue ThisElt(0, 0), LastElt(0, 0);
05322       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
05323       if (LastIsNonZero) {
05324         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
05325                               MVT::i16, Op.getOperand(i-1));
05326       }
05327       if (ThisIsNonZero) {
05328         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
05329         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
05330                               ThisElt, DAG.getConstant(8, MVT::i8));
05331         if (LastIsNonZero)
05332           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
05333       } else
05334         ThisElt = LastElt;
05335 
05336       if (ThisElt.getNode())
05337         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
05338                         DAG.getIntPtrConstant(i/2));
05339     }
05340   }
05341 
05342   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
05343 }
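
// For illustration: two adjacent non-zero bytes b0 (operand 0) and b1
// (operand 1) are combined into the 16-bit value (zext(b1) << 8) | zext(b0)
// and inserted into element 0 of a v8i16 vector; the final BITCAST
// reinterprets that vector as the requested v16i8.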
05344 
05345 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
05346 ///
05347 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
05348                                      unsigned NumNonZero, unsigned NumZero,
05349                                      SelectionDAG &DAG,
05350                                      const X86Subtarget* Subtarget,
05351                                      const TargetLowering &TLI) {
05352   if (NumNonZero > 4)
05353     return SDValue();
05354 
05355   SDLoc dl(Op);
05356   SDValue V(0, 0);
05357   bool First = true;
05358   for (unsigned i = 0; i < 8; ++i) {
05359     bool isNonZero = (NonZeros & (1 << i)) != 0;
05360     if (isNonZero) {
05361       if (First) {
05362         if (NumZero)
05363           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05364         else
05365           V = DAG.getUNDEF(MVT::v8i16);
05366         First = false;
05367       }
05368       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
05369                       MVT::v8i16, V, Op.getOperand(i),
05370                       DAG.getIntPtrConstant(i));
05371     }
05372   }
05373 
05374   return V;
05375 }
05376 
05377 /// getVShift - Return a vector logical shift node.
05378 ///
05379 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
05380                          unsigned NumBits, SelectionDAG &DAG,
05381                          const TargetLowering &TLI, SDLoc dl) {
05382   assert(VT.is128BitVector() && "Unknown type for VShift");
05383   EVT ShVT = MVT::v2i64;
05384   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
05385   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
05386   return DAG.getNode(ISD::BITCAST, dl, VT,
05387                      DAG.getNode(Opc, dl, ShVT, SrcOp,
05388                              DAG.getConstant(NumBits,
05389                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
05390 }
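
// For illustration: getVShift(/*isLeft=*/true, MVT::v4i32, SrcOp,
// /*NumBits=*/32, ...) bitcasts SrcOp to v2i64 and emits X86ISD::VSHLDQ with
// a shift amount of 32 bits, i.e. a whole-register shift by NumBits/8 == 4
// bytes, and then bitcasts the result back to v4i32.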
05391 
05392 static SDValue
05393 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
05394 
05395   // Check if the scalar load can be widened into a vector load, and if
05396   // the address is "base + cst", see if the cst can be "absorbed" into
05397   // the shuffle mask.
05398   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
05399     SDValue Ptr = LD->getBasePtr();
05400     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
05401       return SDValue();
05402     EVT PVT = LD->getValueType(0);
05403     if (PVT != MVT::i32 && PVT != MVT::f32)
05404       return SDValue();
05405 
05406     int FI = -1;
05407     int64_t Offset = 0;
05408     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
05409       FI = FINode->getIndex();
05410       Offset = 0;
05411     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
05412                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
05413       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
05414       Offset = Ptr.getConstantOperandVal(1);
05415       Ptr = Ptr.getOperand(0);
05416     } else {
05417       return SDValue();
05418     }
05419 
05420     // FIXME: 256-bit vector instructions don't require a strict alignment,
05421     // improve this code to support it better.
05422     unsigned RequiredAlign = VT.getSizeInBits()/8;
05423     SDValue Chain = LD->getChain();
05424     // Make sure the stack object alignment is at least 16 or 32.
05425     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
05426     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
05427       if (MFI->isFixedObjectIndex(FI)) {
05428         // Can't change the alignment. FIXME: It's possible to compute
05429         // the exact stack offset and reference FI + adjust offset instead.
05430         // If someone *really* cares about this. That's the way to implement it.
05431         return SDValue();
05432       } else {
05433         MFI->setObjectAlignment(FI, RequiredAlign);
05434       }
05435     }
05436 
05437     // (Offset % 16 or 32) must be a multiple of 4. The address is then
05438     // Ptr + (Offset & ~15).
05439     if (Offset < 0)
05440       return SDValue();
05441     if ((Offset % RequiredAlign) & 3)
05442       return SDValue();
05443     int64_t StartOffset = Offset & ~(RequiredAlign-1);
05444     if (StartOffset)
05445       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
05446                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
05447 
05448     int EltNo = (Offset - StartOffset) >> 2;
05449     unsigned NumElems = VT.getVectorNumElements();
05450 
05451     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
05452     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
05453                              LD->getPointerInfo().getWithOffset(StartOffset),
05454                              false, false, false, 0);
05455 
05456     SmallVector<int, 8> Mask;
05457     for (unsigned i = 0; i != NumElems; ++i)
05458       Mask.push_back(EltNo);
05459 
05460     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
05461   }
05462 
05463   return SDValue();
05464 }
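
// Worked example (VT == v4i32): a 32-bit load from a 16-byte-aligned stack
// slot at offset base + 8 gets StartOffset == (8 & ~15) == 0 and
// EltNo == (8 - 0) >> 2 == 2, so it is rewritten as a 16-byte vector load
// from 'base' followed by the splat shuffle <2, 2, 2, 2>.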
05465 
05466 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
05467 /// vector of type 'VT', see if the elements can be replaced by a single large
05468 /// load which has the same value as a build_vector whose operands are 'elts'.
05469 ///
05470 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
05471 ///
05472 /// FIXME: we'd also like to handle the case where the last elements are zero
05473 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
05474 /// There's even a handy isZeroNode for that purpose.
05475 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
05476                                         SDLoc &DL, SelectionDAG &DAG,
05477                                         bool isAfterLegalize) {
05478   EVT EltVT = VT.getVectorElementType();
05479   unsigned NumElems = Elts.size();
05480 
05481   LoadSDNode *LDBase = NULL;
05482   unsigned LastLoadedElt = -1U;
05483 
05484   // For each element in the initializer, see if we've found a load or an undef.
05485   // If we don't find an initial load element, or later load elements are
05486   // non-consecutive, bail out.
05487   for (unsigned i = 0; i < NumElems; ++i) {
05488     SDValue Elt = Elts[i];
05489 
05490     if (!Elt.getNode() ||
05491         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
05492       return SDValue();
05493     if (!LDBase) {
05494       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
05495         return SDValue();
05496       LDBase = cast<LoadSDNode>(Elt.getNode());
05497       LastLoadedElt = i;
05498       continue;
05499     }
05500     if (Elt.getOpcode() == ISD::UNDEF)
05501       continue;
05502 
05503     LoadSDNode *LD = cast<LoadSDNode>(Elt);
05504     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
05505       return SDValue();
05506     LastLoadedElt = i;
05507   }
05508 
05509   // If we have found an entire vector of loads and undefs, then return a large
05510   // load of the entire vector width starting at the base pointer.  If we found
05511   // consecutive loads for the low half, generate a vzext_load node.
05512   if (LastLoadedElt == NumElems - 1) {
05513 
05514     if (isAfterLegalize &&
05515         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
05516       return SDValue();
05517 
05518     SDValue NewLd = SDValue();
05519 
05520     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
05521       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05522                           LDBase->getPointerInfo(), LDBase->isVolatile(),
05523                           LDBase->isNonTemporal(), LDBase->isInvariant(), 0);
05524     else
05525       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05526                           LDBase->getPointerInfo(), LDBase->isVolatile(),
05527                           LDBase->isNonTemporal(), LDBase->isInvariant(),
05528                           LDBase->getAlignment());
05529 
05530     if (LDBase->hasAnyUseOfValue(1)) {
05531       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05532                                      SDValue(LDBase, 1),
05533                                      SDValue(NewLd.getNode(), 1));
05534       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05535       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05536                              SDValue(NewLd.getNode(), 1));
05537     }
05538 
05539     return NewLd;
05540   }
05541   if (NumElems == 4 && LastLoadedElt == 1 &&
05542       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
05543     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
05544     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
05545     SDValue ResNode =
05546         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
05547                                 array_lengthof(Ops), MVT::i64,
05548                                 LDBase->getPointerInfo(),
05549                                 LDBase->getAlignment(),
05550                                 false/*isVolatile*/, true/*ReadMem*/,
05551                                 false/*WriteMem*/);
05552 
05553     // Make sure the newly-created LOAD is in the same position as LDBase in
05554     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
05555     // update uses of LDBase's output chain to use the TokenFactor.
05556     if (LDBase->hasAnyUseOfValue(1)) {
05557       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05558                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
05559       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05560       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05561                              SDValue(ResNode.getNode(), 1));
05562     }
05563 
05564     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
05565   }
05566   return SDValue();
05567 }
05568 
05569 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
05570 /// to generate a splat value for the following cases:
05571 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
05572 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
05573 /// a scalar load, or a constant.
05574 /// The VBROADCAST node is returned when a pattern is found,
05575 /// or SDValue() otherwise.
05576 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
05577                                     SelectionDAG &DAG) {
05578   if (!Subtarget->hasFp256())
05579     return SDValue();
05580 
05581   MVT VT = Op.getSimpleValueType();
05582   SDLoc dl(Op);
05583 
05584   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
05585          "Unsupported vector type for broadcast.");
05586 
05587   SDValue Ld;
05588   bool ConstSplatVal;
05589 
05590   switch (Op.getOpcode()) {
05591     default:
05592       // Unknown pattern found.
05593       return SDValue();
05594 
05595     case ISD::BUILD_VECTOR: {
05596       // The BUILD_VECTOR node must be a splat.
05597       if (!isSplatVector(Op.getNode()))
05598         return SDValue();
05599 
05600       Ld = Op.getOperand(0);
05601       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
05602                      Ld.getOpcode() == ISD::ConstantFP);
05603 
05604       // The suspected load node has several users. Make sure that all
05605       // of its users are from the BUILD_VECTOR node.
05606       // Constants may have multiple users.
05607       if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
05608         return SDValue();
05609       break;
05610     }
05611 
05612     case ISD::VECTOR_SHUFFLE: {
05613       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
05614 
05615       // Shuffles must have a splat mask where the first element is
05616       // broadcasted.
05617       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
05618         return SDValue();
05619 
05620       SDValue Sc = Op.getOperand(0);
05621       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
05622           Sc.getOpcode() != ISD::BUILD_VECTOR) {
05623 
05624         if (!Subtarget->hasInt256())
05625           return SDValue();
05626 
05627         // Use the register form of the broadcast instruction available on AVX2.
05628         if (VT.getSizeInBits() >= 256)
05629           Sc = Extract128BitVector(Sc, 0, DAG, dl);
05630         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
05631       }
05632 
05633       Ld = Sc.getOperand(0);
05634       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
05635                        Ld.getOpcode() == ISD::ConstantFP);
05636 
05637       // The scalar_to_vector node and the suspected
05638       // load node must have exactly one user.
05639       // Constants may have multiple users.
05640 
05641       // AVX-512 has a register version of the broadcast
05642       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
05643         Ld.getValueType().getSizeInBits() >= 32;
05644       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
05645           !hasRegVer))
05646         return SDValue();
05647       break;
05648     }
05649   }
05650 
05651   bool IsGE256 = (VT.getSizeInBits() >= 256);
05652 
05653   // Handle broadcasting a single constant scalar from the constant pool
05654   // into a vector. On Sandybridge it is still better to load a constant vector
05655   // from the constant pool and not to broadcast it from a scalar.
05656   if (ConstSplatVal && Subtarget->hasInt256()) {
05657     EVT CVT = Ld.getValueType();
05658     assert(!CVT.isVector() && "Must not broadcast a vector type");
05659     unsigned ScalarSize = CVT.getSizeInBits();
05660 
05661     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
05662       const Constant *C = 0;
05663       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
05664         C = CI->getConstantIntValue();
05665       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
05666         C = CF->getConstantFPValue();
05667 
05668       assert(C && "Invalid constant type");
05669 
05670       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
05671       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
05672       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
05673       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
05674                        MachinePointerInfo::getConstantPool(),
05675                        false, false, false, Alignment);
05676 
05677       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05678     }
05679   }
05680 
05681   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
05682   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
05683 
05684   // Handle AVX2 in-register broadcasts.
05685   if (!IsLoad && Subtarget->hasInt256() &&
05686       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
05687     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05688 
05689   // The scalar source must be a normal load.
05690   if (!IsLoad)
05691     return SDValue();
05692 
05693   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
05694     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05695 
05696   // The integer check is needed for the 64-bit into 128-bit case, so that it
05697   // doesn't match double, since there is no vbroadcastsd xmm instruction.
05698   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
05699     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
05700       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05701   }
05702 
05703   // Unsupported broadcast.
05704   return SDValue();
05705 }
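
// For illustration: a v8f32 BUILD_VECTOR whose operands are all the same
// scalar load (with no uses outside this BUILD_VECTOR) is matched by the
// ISD::BUILD_VECTOR case above and lowered to X86ISD::VBROADCAST of the
// load, which an AVX target can select to VBROADCASTSS ymm from memory.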
05706 
05707 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
05708 /// underlying vector and index.
05709 ///
05710 /// Modifies \p ExtractedFromVec to the real vector and returns the real
05711 /// index.
05712 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
05713                                          SDValue ExtIdx) {
05714   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
05715   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
05716     return Idx;
05717 
05718   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
05719   // lowered this:
05720   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
05721   // to:
05722   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
05723   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
05724   //                           undef)
05725   //                       Constant<0>)
05726   // In this case the vector is the extract_subvector expression and the index
05727   // is 2, as specified by the shuffle.
05728   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
05729   SDValue ShuffleVec = SVOp->getOperand(0);
05730   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
05731   assert(ShuffleVecVT.getVectorElementType() ==
05732          ExtractedFromVec.getSimpleValueType().getVectorElementType());
05733 
05734   int ShuffleIdx = SVOp->getMaskElt(Idx);
05735   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
05736     ExtractedFromVec = ShuffleVec;
05737     return ShuffleIdx;
05738   }
05739   return Idx;
05740 }
05741 
05742 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
05743   MVT VT = Op.getSimpleValueType();
05744 
05745   // Skip if insert_vec_elt is not supported.
05746   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
05747   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
05748     return SDValue();
05749 
05750   SDLoc DL(Op);
05751   unsigned NumElems = Op.getNumOperands();
05752 
05753   SDValue VecIn1;
05754   SDValue VecIn2;
05755   SmallVector<unsigned, 4> InsertIndices;
05756   SmallVector<int, 8> Mask(NumElems, -1);
05757 
05758   for (unsigned i = 0; i != NumElems; ++i) {
05759     unsigned Opc = Op.getOperand(i).getOpcode();
05760 
05761     if (Opc == ISD::UNDEF)
05762       continue;
05763 
05764     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
05765       // Quit if more than 1 element needs inserting.
05766       if (InsertIndices.size() > 1)
05767         return SDValue();
05768 
05769       InsertIndices.push_back(i);
05770       continue;
05771     }
05772 
05773     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
05774     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
05775     // Quit if non-constant index.
05776     if (!isa<ConstantSDNode>(ExtIdx))
05777       return SDValue();
05778     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
05779 
05780     // Quit if extracted from vector of different type.
05781     if (ExtractedFromVec.getValueType() != VT)
05782       return SDValue();
05783 
05784     if (VecIn1.getNode() == 0)
05785       VecIn1 = ExtractedFromVec;
05786     else if (VecIn1 != ExtractedFromVec) {
05787       if (VecIn2.getNode() == 0)
05788         VecIn2 = ExtractedFromVec;
05789       else if (VecIn2 != ExtractedFromVec)
05790         // Quit if more than 2 vectors to shuffle
05791         return SDValue();
05792     }
05793 
05794     if (ExtractedFromVec == VecIn1)
05795       Mask[i] = Idx;
05796     else if (ExtractedFromVec == VecIn2)
05797       Mask[i] = Idx + NumElems;
05798   }
05799 
05800   if (VecIn1.getNode() == 0)
05801     return SDValue();
05802 
05803   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
05804   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
05805   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
05806     unsigned Idx = InsertIndices[i];
05807     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
05808                      DAG.getIntPtrConstant(Idx));
05809   }
05810 
05811   return NV;
05812 }
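
// For illustration: a v4i32 BUILD_VECTOR of the form
//   (extractelt %A, 0), (extractelt %A, 1), (extractelt %A, 2), %scalar
// becomes a shuffle of %A with mask <0, 1, 2, -1> followed by a single
// INSERT_VECTOR_ELT of %scalar at index 3.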
05813 
05814 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
05815 SDValue
05816 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
05817 
05818   MVT VT = Op.getSimpleValueType();
05819   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
05820          "Unexpected type in LowerBUILD_VECTORvXi1!");
05821 
05822   SDLoc dl(Op);
05823   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05824     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
05825     SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
05826                       Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05827     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
05828                        Ops, VT.getVectorNumElements());
05829   }
05830 
05831   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
05832     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
05833     SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
05834                       Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05835     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
05836                        Ops, VT.getVectorNumElements());
05837   }
05838 
05839   bool AllConstants = true;
05840   uint64_t Immediate = 0;
05841   int NonConstIdx = -1;
05842   bool IsSplat = true;
05843   unsigned NumNonConsts = 0;
05844   unsigned NumConsts = 0;
05845   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
05846     SDValue In = Op.getOperand(idx);
05847     if (In.getOpcode() == ISD::UNDEF)
05848       continue;
05849     if (!isa<ConstantSDNode>(In)) {
05850       AllConstants = false;
05851       NonConstIdx = idx;
05852       NumNonConsts++;
05853     }
05854     else {
05855       NumConsts++;
05856       if (cast<ConstantSDNode>(In)->getZExtValue())
05857         Immediate |= (1ULL << idx);
05858     }
05859     if (In != Op.getOperand(0))
05860       IsSplat = false;
05861   }
05862 
05863   if (AllConstants) {
05864     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
05865       DAG.getConstant(Immediate, MVT::i16));
05866     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
05867                        DAG.getIntPtrConstant(0));
05868   }
05869 
05870   if (NumNonConsts == 1 && NonConstIdx != 0) {
05871     SDValue DstVec;
05872     if (NumConsts) {
05873       SDValue VecAsImm = DAG.getConstant(Immediate,
05874                                          MVT::getIntegerVT(VT.getSizeInBits()));
05875       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
05876     }
05877     else 
05878       DstVec = DAG.getUNDEF(VT);
05879     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
05880                        Op.getOperand(NonConstIdx),
05881                        DAG.getIntPtrConstant(NonConstIdx));
05882   }
05883   if (!IsSplat && (NonConstIdx != 0))
05884     llvm_unreachable("Unsupported BUILD_VECTOR operation");
05885   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
05886   SDValue Select;
05887   if (IsSplat)
05888     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
05889                           DAG.getConstant(-1, SelectVT),
05890                           DAG.getConstant(0, SelectVT));
05891   else
05892     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
05893                          DAG.getConstant((Immediate | 1), SelectVT),
05894                          DAG.getConstant(Immediate, SelectVT));
05895   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
05896 }
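
// For illustration: for a v8i1 BUILD_VECTOR <x, 0, 1, 0, 1, 0, 0, 1> with a
// single non-constant operand x at index 0, Immediate == 0x94, and the node
// is lowered to (bitcast (select x, i8 0x95, i8 0x94)) so that bit 0 tracks
// x while the constant bits stay fixed.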
05897 
05898 SDValue
05899 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
05900   SDLoc dl(Op);
05901 
05902   MVT VT = Op.getSimpleValueType();
05903   MVT ExtVT = VT.getVectorElementType();
05904   unsigned NumElems = Op.getNumOperands();
05905 
05906   // Lower BUILD_VECTORs of predicate (i1) vectors separately.
05907   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
05908     return LowerBUILD_VECTORvXi1(Op, DAG);
05909 
05910   // Vectors containing all zeros can be matched by pxor and xorps later
05911   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05912     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
05913     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
05914     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
05915       return Op;
05916 
05917     return getZeroVector(VT, Subtarget, DAG, dl);
05918   }
05919 
05920   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
05921   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
05922   // vpcmpeqd on 256-bit vectors.
05923   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
05924     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
05925       return Op;
05926 
05927     if (!VT.is512BitVector())
05928       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
05929   }
05930 
05931   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
05932   if (Broadcast.getNode())
05933     return Broadcast;
05934 
05935   unsigned EVTBits = ExtVT.getSizeInBits();
05936 
05937   unsigned NumZero  = 0;
05938   unsigned NumNonZero = 0;
05939   unsigned NonZeros = 0;
05940   bool IsAllConstants = true;
05941   SmallSet<SDValue, 8> Values;
05942   for (unsigned i = 0; i < NumElems; ++i) {
05943     SDValue Elt = Op.getOperand(i);
05944     if (Elt.getOpcode() == ISD::UNDEF)
05945       continue;
05946     Values.insert(Elt);
05947     if (Elt.getOpcode() != ISD::Constant &&
05948         Elt.getOpcode() != ISD::ConstantFP)
05949       IsAllConstants = false;
05950     if (X86::isZeroNode(Elt))
05951       NumZero++;
05952     else {
05953       NonZeros |= (1 << i);
05954       NumNonZero++;
05955     }
05956   }
05957 
05958   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
05959   if (NumNonZero == 0)
05960     return DAG.getUNDEF(VT);
05961 
05962   // Special case for single non-zero, non-undef, element.
05963   if (NumNonZero == 1) {
05964     unsigned Idx = countTrailingZeros(NonZeros);
05965     SDValue Item = Op.getOperand(Idx);
05966 
05967     // If this is an insertion of an i64 value on x86-32, and if the top bits of
05968     // the value are obviously zero, truncate the value to i32 and do the
05969     // insertion that way.  Only do this if the value is non-constant or if the
05970     // value is a constant being inserted into element 0.  It is cheaper to do
05971     // a constant pool load than it is to do a movd + shuffle.
05972     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
05973         (!IsAllConstants || Idx == 0)) {
05974       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
05975         // Handle SSE only.
05976         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
05977         EVT VecVT = MVT::v4i32;
05978         unsigned VecElts = 4;
05979 
05980         // Truncate the value (which may itself be a constant) to i32, and
05981         // convert it to a vector with movd (S2V+shuffle to zero extend).
05982         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
05983         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
05984         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05985 
05986         // Now we have our 32-bit value zero extended in the low element of
05987         // a vector.  If Idx != 0, swizzle it into place.
05988         if (Idx != 0) {
05989           SmallVector<int, 4> Mask;
05990           Mask.push_back(Idx);
05991           for (unsigned i = 1; i != VecElts; ++i)
05992             Mask.push_back(i);
05993           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
05994                                       &Mask[0]);
05995         }
05996         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
05997       }
05998     }
05999 
06000     // If we have a constant or non-constant insertion into the low element of
06001     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
06002     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
06003     // depending on what the source datatype is.
06004     if (Idx == 0) {
06005       if (NumZero == 0)
06006         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06007 
06008       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
06009           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
06010         if (VT.is256BitVector() || VT.is512BitVector()) {
06011           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
06012           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
06013                              Item, DAG.getIntPtrConstant(0));
06014         }
06015         assert(VT.is128BitVector() && "Expected an SSE value type!");
06016         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06017         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
06018         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06019       }
06020 
06021       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
06022         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
06023         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
06024         if (VT.is256BitVector()) {
06025           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
06026           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
06027         } else {
06028           assert(VT.is128BitVector() && "Expected an SSE value type!");
06029           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06030         }
06031         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06032       }
06033     }
06034 
06035     // Is it a vector logical left shift?
06036     if (NumElems == 2 && Idx == 1 &&
06037         X86::isZeroNode(Op.getOperand(0)) &&
06038         !X86::isZeroNode(Op.getOperand(1))) {
06039       unsigned NumBits = VT.getSizeInBits();
06040       return getVShift(true, VT,
06041                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
06042                                    VT, Op.getOperand(1)),
06043                        NumBits/2, DAG, *this, dl);
06044     }
06045 
06046     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
06047       return SDValue();
06048 
06049     // Otherwise, if this is a vector with i32 or f32 elements, and the element
06050     // is a non-constant being inserted into an element other than the low one,
06051     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
06052     // movd/movss) to move this into the low element, then shuffle it into
06053     // place.
06054     if (EVTBits == 32) {
06055       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06056 
06057       // Turn it into a shuffle of zero and zero-extended scalar to vector.
06058       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
06059       SmallVector<int, 8> MaskVec;
06060       for (unsigned i = 0; i != NumElems; ++i)
06061         MaskVec.push_back(i == Idx ? 0 : 1);
06062       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
06063     }
06064   }
06065 
06066   // Splat is obviously ok. Let legalizer expand it to a shuffle.
06067   if (Values.size() == 1) {
06068     if (EVTBits == 32) {
06069       // Instead of a shuffle like this:
06070       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
06071       // Check if it's possible to issue this instead.
06072       // shuffle (vload ptr), undef, <1, 1, 1, 1>
06073       unsigned Idx = countTrailingZeros(NonZeros);
06074       SDValue Item = Op.getOperand(Idx);
06075       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
06076         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
06077     }
06078     return SDValue();
06079   }
06080 
06081   // A vector full of immediates; various special cases are already
06082   // handled, so this is best done with a single constant-pool load.
06083   if (IsAllConstants)
06084     return SDValue();
06085 
06086   // For AVX-length vectors, build the individual 128-bit pieces and use
06087   // shuffles to put them in place.
06088   if (VT.is256BitVector() || VT.is512BitVector()) {
06089     SmallVector<SDValue, 64> V;
06090     for (unsigned i = 0; i != NumElems; ++i)
06091       V.push_back(Op.getOperand(i));
06092 
06093     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
06094 
06095     // Build both the lower and upper subvector.
06096     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
06097     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
06098                                 NumElems/2);
06099 
06100     // Recreate the wider vector with the lower and upper part.
06101     if (VT.is256BitVector())
06102       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06103     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06104   }
06105 
06106   // Let legalizer expand 2-wide build_vectors.
06107   if (EVTBits == 64) {
06108     if (NumNonZero == 1) {
06109       // One half is zero or undef.
06110       unsigned Idx = countTrailingZeros(NonZeros);
06111       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
06112                                  Op.getOperand(Idx));
06113       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
06114     }
06115     return SDValue();
06116   }
06117 
06118   // If element VT is < 32 bits, convert it to inserts into a zero vector.
06119   if (EVTBits == 8 && NumElems == 16) {
06120     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
06121                                         Subtarget, *this);
06122     if (V.getNode()) return V;
06123   }
06124 
06125   if (EVTBits == 16 && NumElems == 8) {
06126     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
06127                                       Subtarget, *this);
06128     if (V.getNode()) return V;
06129   }
06130 
06131   // If element VT is == 32 bits, turn it into a number of shuffles.
06132   SmallVector<SDValue, 8> V(NumElems);
06133   if (NumElems == 4 && NumZero > 0) {
06134     for (unsigned i = 0; i < 4; ++i) {
06135       bool isZero = !(NonZeros & (1 << i));
06136       if (isZero)
06137         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
06138       else
06139         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
06140     }
06141 
06142     for (unsigned i = 0; i < 2; ++i) {
06143       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
06144         default: break;
06145         case 0:
06146           V[i] = V[i*2];  // Must be a zero vector.
06147           break;
06148         case 1:
06149           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
06150           break;
06151         case 2:
06152           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
06153           break;
06154         case 3:
06155           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
06156           break;
06157       }
06158     }
06159 
06160     bool Reverse1 = (NonZeros & 0x3) == 2;
06161     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
06162     int MaskVec[] = {
06163       Reverse1 ? 1 : 0,
06164       Reverse1 ? 0 : 1,
06165       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
06166       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
06167     };
06168     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
06169   }
06170 
06171   if (Values.size() > 1 && VT.is128BitVector()) {
06172     // Check for a build vector of consecutive loads.
06173     for (unsigned i = 0; i < NumElems; ++i)
06174       V[i] = Op.getOperand(i);
06175 
06176     // Check for elements which are consecutive loads.
06177     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
06178     if (LD.getNode())
06179       return LD;
06180 
06181     // Check for a build vector from mostly shuffle plus few inserting.
06182     SDValue Sh = buildFromShuffleMostly(Op, DAG);
06183     if (Sh.getNode())
06184       return Sh;
06185 
06186     // For SSE 4.1, use insertps to put the high elements into the low element.
06187     if (getSubtarget()->hasSSE41()) {
06188       SDValue Result;
06189       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
06190         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
06191       else
06192         Result = DAG.getUNDEF(VT);
06193 
06194       for (unsigned i = 1; i < NumElems; ++i) {
06195         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
06196         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
06197                              Op.getOperand(i), DAG.getIntPtrConstant(i));
06198       }
06199       return Result;
06200     }
06201 
06202     // Otherwise, expand into a number of unpckl*, start by extending each of
06203     // our (non-undef) elements to the full vector width with the element in the
06204     // bottom slot of the vector (which generates no code for SSE).
06205     for (unsigned i = 0; i < NumElems; ++i) {
06206       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
06207         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
06208       else
06209         V[i] = DAG.getUNDEF(VT);
06210     }
06211 
06212     // Next, we iteratively mix elements, e.g. for v4f32:
06213     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
06214     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
06215     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
06216     unsigned EltStride = NumElems >> 1;
06217     while (EltStride != 0) {
06218       for (unsigned i = 0; i < EltStride; ++i) {
06219         // If V[i+EltStride] is undef and this is the first round of mixing,
06220         // then it is safe to just drop this shuffle: V[i] is already in the
06221         // right place, the one element (since it's the first round) being
06222         // inserted as undef can be dropped.  This isn't safe for successive
06223         // rounds because they will permute elements within both vectors.
06224         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
06225             EltStride == NumElems/2)
06226           continue;
06227 
06228         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
06229       }
06230       EltStride >>= 1;
06231     }
06232     return V[0];
06233   }
06234   return SDValue();
06235 }
06236 
06237 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
06238 // to create 256-bit vectors from two other 128-bit ones.
06239 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
06240   SDLoc dl(Op);
06241   MVT ResVT = Op.getSimpleValueType();
06242 
06243   assert((ResVT.is256BitVector() ||
06244           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
06245 
06246   SDValue V1 = Op.getOperand(0);
06247   SDValue V2 = Op.getOperand(1);
06248   unsigned NumElems = ResVT.getVectorNumElements();
06249   if (ResVT.is256BitVector())
06250     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
06251 
06252   if (Op.getNumOperands() == 4) {
06253     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
06254                                 ResVT.getVectorNumElements()/2);
06255     SDValue V3 = Op.getOperand(2);
06256     SDValue V4 = Op.getOperand(3);
06257     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
06258       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
06259   }
06260   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
06261 }
06262 
06263 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
06264   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
06265   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
06266          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
06267           Op.getNumOperands() == 4)));
06268 
06269   // AVX can use the vinsertf128 instruction to create 256-bit vectors
06270   // from two other 128-bit ones.
06271 
06272   // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
06273   return LowerAVXCONCAT_VECTORS(Op, DAG);
06274 }
06275 
06276 // Try to lower a shuffle node into a simple blend instruction.
06277 static SDValue
06278 LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
06279                            const X86Subtarget *Subtarget, SelectionDAG &DAG) {
06280   SDValue V1 = SVOp->getOperand(0);
06281   SDValue V2 = SVOp->getOperand(1);
06282   SDLoc dl(SVOp);
06283   MVT VT = SVOp->getSimpleValueType(0);
06284   MVT EltVT = VT.getVectorElementType();
06285   unsigned NumElems = VT.getVectorNumElements();
06286 
06287   // There is no blend with immediate in AVX-512.
06288   if (VT.is512BitVector())
06289     return SDValue();
06290 
06291   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
06292     return SDValue();
06293   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
06294     return SDValue();
06295 
06296   // Check the mask for BLEND and build the value.
06297   unsigned MaskValue = 0;
06298   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
06299   unsigned NumLanes = (NumElems-1)/8 + 1;
06300   unsigned NumElemsInLane = NumElems / NumLanes;
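        // Worked example (single-lane case): a v4i32 shuffle with mask
        // <0, 5, 2, 7> takes elements 1 and 3 from V2, so the loop below
        // produces MaskValue == 0b1010.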
06301 
06302     // Blend for v16i16 should be symmetric for both lanes.
06303   for (unsigned i = 0; i < NumElemsInLane; ++i) {
06304 
06305     int SndLaneEltIdx = (NumLanes == 2) ?
06306       SVOp->getMaskElt(i + NumElemsInLane) : -1;
06307     int EltIdx = SVOp->getMaskElt(i);
06308 
06309     if ((EltIdx < 0 || EltIdx == (int)i) &&
06310         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
06311       continue;
06312 
06313     if (((unsigned)EltIdx == (i + NumElems)) &&
06314         (SndLaneEltIdx < 0 ||
06315          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
06316       MaskValue |= (1<<i);
06317     else
06318       return SDValue();
06319   }
06320 
06321   // Convert i32 vectors to floating point if AVX2 is not available.
06322   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
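        // Such blends are therefore performed on the equivalent floating-point
        // type (e.g. without AVX2 a v4i32 blend is done as v4f32, which
        // typically selects BLENDPS).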
06323   MVT BlendVT = VT;
06324   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
06325     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
06326                                NumElems);
06327     V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
06328     V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
06329   }
06330 
06331   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
06332                             DAG.getConstant(MaskValue, MVT::i32));
06333   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
06334 }
06335 
06336 /// In vector type \p VT, return true if the element at index \p InputIdx
06337 /// falls on a different 128-bit lane than \p OutputIdx.
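      /// For example, with 32-bit elements in a 256-bit vector, indices 0-3 lie
      /// in lane 0 and indices 4-7 in lane 1, so InputIdx = 2 with OutputIdx = 5
      /// crosses lanes.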
06338 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
06339                                      unsigned OutputIdx) {
06340   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
06341   return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
06342 }
06343 
06344 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
06345 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
06346 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If an
06347 /// entry of \p MaskVals refers to an element outside of \p V1 or is undef
06348 /// (-1), a zero is inserted.
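      /// For example (illustrative), with a v4i32 \p V1 and
      /// MaskVals = <1, -1, 0, 3>, the byte mask built below is
      /// <4,5,6,7, 0x80,0x80,0x80,0x80, 0,1,2,3, 12,13,14,15>.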
06349 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
06350                          SelectionDAG &DAG) {
06351   MVT VT = V1.getSimpleValueType();
06352   assert(VT.is128BitVector() || VT.is256BitVector());
06353 
06354   MVT EltVT = VT.getVectorElementType();
06355   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
06356   unsigned NumElts = VT.getVectorNumElements();
06357 
06358   SmallVector<SDValue, 32> PshufbMask;
06359   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
06360     int InputIdx = MaskVals[OutputIdx];
06361     unsigned InputByteIdx;
06362 
06363     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
06364       InputByteIdx = 0x80;
06365     else {
06366       // Crossing 128-bit lanes is not allowed.
06367       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
06368         return SDValue();
06369       InputByteIdx = InputIdx * EltSizeInBytes;
06370       // The index is a byte offset within the 128-bit lane.
06371       InputByteIdx &= 0xf;
06372     }
06373 
06374     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
06375       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
06376       if (InputByteIdx != 0x80)
06377         ++InputByteIdx;
06378     }
06379   }
06380 
06381   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
06382   if (ShufVT != VT)
06383     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
06384   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
06385                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT,
06386                                  PshufbMask.data(), PshufbMask.size()));
06387 }
06388 
06389 // v8i16 shuffles - Prefer shuffles in the following order:
06390 // 1. [all]   pshuflw, pshufhw, optional move
06391 // 2. [ssse3] 1 x pshufb
06392 // 3. [ssse3] 2 x pshufb + 1 x por
06393 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
06394 static SDValue
06395 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
06396                          SelectionDAG &DAG) {
06397   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
06398   SDValue V1 = SVOp->getOperand(0);
06399   SDValue V2 = SVOp->getOperand(1);
06400   SDLoc dl(SVOp);
06401   SmallVector<int, 8> MaskVals;
06402 
06403   // Determine if more than 1 of the words in each of the low and high quadwords
06404   // of the result come from the same quadword of one of the two inputs.  Undef
06405   // mask values count as coming from any quadword, for better codegen.
06406   //
06407   // Lo/HiQuad[i] counts how many words from the i-th quad of the input
06408   // feed this quad.  For i, 0 and 1 refer to V1, and 2 and 3 refer to V2.
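        // For example, the mask <0,1,2,3, 8,9,10,11> gives LoQuad = {4,0,0,0}
        // and HiQuad = {0,0,4,0}: the low half comes entirely from quad 0 of V1
        // and the high half entirely from quad 0 of V2.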
06409   unsigned LoQuad[] = { 0, 0, 0, 0 };
06410   unsigned HiQuad[] = { 0, 0, 0, 0 };
06411   // Indices of quads used.
06412   std::bitset<4> InputQuads;
06413   for (unsigned i = 0; i < 8; ++i) {
06414     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
06415     int EltIdx = SVOp->getMaskElt(i);
06416     MaskVals.push_back(EltIdx);
06417     if (EltIdx < 0) {
06418       ++Quad[0];
06419       ++Quad[1];
06420       ++Quad[2];
06421       ++Quad[3];
06422       continue;
06423     }
06424     ++Quad[EltIdx / 4];
06425     InputQuads.set(EltIdx / 4);
06426   }
06427 
06428   int BestLoQuad = -1;
06429   unsigned MaxQuad = 1;
06430   for (unsigned i = 0; i < 4; ++i) {
06431     if (LoQuad[i] > MaxQuad) {
06432       BestLoQuad = i;
06433       MaxQuad = LoQuad[i];
06434     }
06435   }
06436 
06437   int BestHiQuad = -1;
06438   MaxQuad = 1;
06439   for (unsigned i = 0; i < 4; ++i) {
06440     if (HiQuad[i] > MaxQuad) {
06441       BestHiQuad = i;
06442       MaxQuad = HiQuad[i];
06443     }
06444   }
06445 
06446   // For SSSE3, if all 8 words of the result come from only 1 quadword of each
06447   // of the two input vectors, shuffle them into one input vector so only a
06448   // single pshufb instruction is necessary. If there are more than 2 input
06449   // quads, disable the next transformation since it does not help SSSE3.
06450   bool V1Used = InputQuads[0] || InputQuads[1];
06451   bool V2Used = InputQuads[2] || InputQuads[3];
06452   if (Subtarget->hasSSSE3()) {
06453     if (InputQuads.count() == 2 && V1Used && V2Used) {
06454       BestLoQuad = InputQuads[0] ? 0 : 1;
06455       BestHiQuad = InputQuads[2] ? 2 : 3;
06456     }
06457     if (InputQuads.count() > 2) {
06458       BestLoQuad = -1;
06459       BestHiQuad = -1;
06460     }
06461   }
06462 
06463   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
06464   // the shuffle mask.  If a quad is scored as -1, that means that it contains
06465   // words from all 4 input quadwords.
06466   SDValue NewV;
06467   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
06468     int MaskV[] = {
06469       BestLoQuad < 0 ? 0 : BestLoQuad,
06470       BestHiQuad < 0 ? 1 : BestHiQuad
06471     };
06472     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
06473                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
06474                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
06475     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
06476 
06477     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
06478     // source words for the shuffle, to aid later transformations.
06479     bool AllWordsInNewV = true;
06480     bool InOrder[2] = { true, true };
06481     for (unsigned i = 0; i != 8; ++i) {
06482       int idx = MaskVals[i];
06483       if (idx != (int)i)
06484         InOrder[i/4] = false;
06485       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
06486         continue;
06487       AllWordsInNewV = false;
06488       break;
06489     }
06490 
06491     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
06492     if (AllWordsInNewV) {
06493       for (int i = 0; i != 8; ++i) {
06494         int idx = MaskVals[i];
06495         if (idx < 0)
06496           continue;
06497         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
06498         if ((idx != i) && idx < 4)
06499           pshufhw = false;
06500         if ((idx != i) && idx > 3)
06501           pshuflw = false;
06502       }
06503       V1 = NewV;
06504       V2Used = false;
06505       BestLoQuad = 0;
06506       BestHiQuad = 1;
06507     }
06508 
06509     // If we've eliminated the use of V2, and the new mask is a pshuflw or
06510     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
06511     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
06512       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
06513       unsigned TargetMask = 0;
06514       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
06515                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
06516       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
06517       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
06518                              getShufflePSHUFLWImmediate(SVOp);
06519       V1 = NewV.getOperand(0);
06520       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
06521     }
06522   }
06523 
06524   // Promote splats to a larger type which usually leads to more efficient code.
06525   // FIXME: Is this true if pshufb is available?
06526   if (SVOp->isSplat())
06527     return PromoteSplat(SVOp, DAG);
06528 
06529   // If we have SSSE3, and all words of the result are from 1 input vector,
06530   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
06531   // is present, fall back to case 4.
06532   if (Subtarget->hasSSSE3()) {
06533     SmallVector<SDValue,16> pshufbMask;
06534 
06535     // If we have elements from both input vectors, set the high bit of the
06536     // shuffle mask element to zero out elements that come from V2 in the V1
06537     // mask, and elements that come from V1 in the V2 mask, so that the two
06538     // results can be OR'd together.
06539     bool TwoInputs = V1Used && V2Used;
06540     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
06541     if (!TwoInputs)
06542       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
06543 
06544     // Calculate the shuffle mask for the second input, shuffle it, and
06545     // OR it with the first shuffled input.
06546     CommuteVectorShuffleMask(MaskVals, 8);
06547     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
06548     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
06549     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
06550   }
06551 
06552   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
06553   // and update MaskVals with new element order.
06554   std::bitset<8> InOrder;
06555   if (BestLoQuad >= 0) {
06556     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
06557     for (int i = 0; i != 4; ++i) {
06558       int idx = MaskVals[i];
06559       if (idx < 0) {
06560         InOrder.set(i);
06561       } else if ((idx / 4) == BestLoQuad) {
06562         MaskV[i] = idx & 3;
06563         InOrder.set(i);
06564       }
06565     }
06566     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
06567                                 &MaskV[0]);
06568 
06569     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
06570       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
06571       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
06572                                   NewV.getOperand(0),
06573                                   getShufflePSHUFLWImmediate(SVOp), DAG);
06574     }
06575   }
06576 
06577   // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
06578   // and update MaskVals with the new element order.
06579   if (BestHiQuad >= 0) {
06580     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
06581     for (unsigned i = 4; i != 8; ++i) {
06582       int idx = MaskVals[i];
06583       if (idx < 0) {
06584         InOrder.set(i);
06585       } else if ((idx / 4) == BestHiQuad) {
06586         MaskV[i] = (idx & 3) + 4;
06587         InOrder.set(i);
06588       }
06589     }
06590     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
06591                                 &MaskV[0]);
06592 
06593     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
06594       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
06595       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
06596                                   NewV.getOperand(0),
06597                                   getShufflePSHUFHWImmediate(SVOp), DAG);
06598     }
06599   }
06600 
06601   // In case BestHiQuad and BestLoQuad were both -1, which means each quadword
06602   // has a word from each of the four input quadwords, calculate the InOrder
06603   // bitvector now before falling through to the insert/extract cleanup.
06604   if (BestLoQuad == -1 && BestHiQuad == -1) {
06605     NewV = V1;
06606     for (int i = 0; i != 8; ++i)
06607       if (MaskVals[i] < 0 || MaskVals[i] == i)
06608         InOrder.set(i);
06609   }
06610 
06611   // The other elements are put in the right place using pextrw and pinsrw.
06612   for (unsigned i = 0; i != 8; ++i) {
06613     if (InOrder[i])
06614       continue;
06615     int EltIdx = MaskVals[i];
06616     if (EltIdx < 0)
06617       continue;
06618     SDValue ExtOp = (EltIdx < 8) ?
06619       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
06620                   DAG.getIntPtrConstant(EltIdx)) :
06621       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
06622                   DAG.getIntPtrConstant(EltIdx - 8));
06623     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
06624                        DAG.getIntPtrConstant(i));
06625   }
06626   return NewV;
06627 }
06628 
06629 /// \brief v16i16 shuffles
06630 ///
06631 /// FIXME: We only support generation of a single pshufb currently.  We can
06632 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
06633 /// well (e.g. 2 x pshufb + 1 x por).
06634 static SDValue
06635 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
06636   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
06637   SDValue V1 = SVOp->getOperand(0);
06638   SDValue V2 = SVOp->getOperand(1);
06639   SDLoc dl(SVOp);
06640 
06641   if (V2.getOpcode() != ISD::UNDEF)
06642     return SDValue();
06643 
06644   SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
06645   return getPSHUFB(MaskVals, V1, dl, DAG);
06646 }
06647 
06648 // v16i8 shuffles - Prefer shuffles in the following order:
06649 // 1. [ssse3] 1 x pshufb
06650 // 2. [ssse3] 2 x pshufb + 1 x por
06651 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
06652 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
06653                                         const X86Subtarget* Subtarget,
06654                                         SelectionDAG &DAG) {
06655   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06656   SDValue V1 = SVOp->getOperand(0);
06657   SDValue V2 = SVOp->getOperand(1);
06658   SDLoc dl(SVOp);
06659   ArrayRef<int> MaskVals = SVOp->getMask();
06660 
06661   // Promote splats to a larger type which usually leads to more efficient code.
06662   // FIXME: Is this true if pshufb is available?
06663   if (SVOp->isSplat())
06664     return PromoteSplat(SVOp, DAG);
06665 
06666   // If we have SSSE3, case 1 is generated when all result bytes come from
06667   // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
06668   // present, fall back to case 3.
06669 
06670   // With SSSE3, use one pshufb per input vector that has elements in the result.
06671   if (Subtarget->hasSSSE3()) {
06672     SmallVector<SDValue,16> pshufbMask;
06673 
06674     // If all result elements are from one input vector, then only translate
06675     // undef mask values to 0x80 (zero out result) in the pshufb mask.
06676     //
06677     // Otherwise, we have elements from both input vectors, and must zero out
06678     // elements that come from V2 in the first mask, and V1 in the second mask
06679     // so that we can OR them together.
06680     for (unsigned i = 0; i != 16; ++i) {
06681       int EltIdx = MaskVals[i];
06682       if (EltIdx < 0 || EltIdx >= 16)
06683         EltIdx = 0x80;
06684       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
06685     }
06686     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
06687                      DAG.getNode(ISD::BUILD_VECTOR, dl,
06688                                  MVT::v16i8, &pshufbMask[0], 16));
06689 
06690     // As PSHUFB will zero elements with negative indices, it's safe to ignore
06691     // the 2nd operand if it's undefined or zero.
06692     if (V2.getOpcode() == ISD::UNDEF ||
06693         ISD::isBuildVectorAllZeros(V2.getNode()))
06694       return V1;
06695 
06696     // Calculate the shuffle mask for the second input, shuffle it, and
06697     // OR it with the first shuffled input.
06698     pshufbMask.clear();
06699     for (unsigned i = 0; i != 16; ++i) {
06700       int EltIdx = MaskVals[i];
06701       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
06702       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
06703     }
06704     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
06705                      DAG.getNode(ISD::BUILD_VECTOR, dl,
06706                                  MVT::v16i8, &pshufbMask[0], 16));
06707     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
06708   }
06709 
06710   // No SSSE3 - Calculate in place words and then fix all out of place words
06711   // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
06712   // the 16 different words that comprise the two doublequadword input vectors.
06713   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
06714   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
06715   SDValue NewV = V1;
06716   for (int i = 0; i != 8; ++i) {
06717     int Elt0 = MaskVals[i*2];
06718     int Elt1 = MaskVals[i*2+1];
06719 
06720     // This word of the result is all undef, skip it.
06721     if (Elt0 < 0 && Elt1 < 0)
06722       continue;
06723 
06724     // This word of the result is already in the correct place, skip it.
06725     if ((Elt0 == i*2) && (Elt1 == i*2+1))
06726       continue;
06727 
06728     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
06729     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
06730     SDValue InsElt;
06731 
06732     // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
06733     // together using a single extract, extract the word and insert it.
06734     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
06735       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
06736                            DAG.getIntPtrConstant(Elt1 / 2));
06737       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
06738                         DAG.getIntPtrConstant(i));
06739       continue;
06740     }
06741 
06742     // If Elt1 is defined, extract it from the appropriate source.  If the
06743     // source byte is not also odd, shift the extracted word left 8 bits;
06744     // otherwise clear the bottom 8 bits if we need to do an OR.
06745     if (Elt1 >= 0) {
06746       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
06747                            DAG.getIntPtrConstant(Elt1 / 2));
06748       if ((Elt1 & 1) == 0)
06749         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
06750                              DAG.getConstant(8,
06751                                   TLI.getShiftAmountTy(InsElt.getValueType())));
06752       else if (Elt0 >= 0)
06753         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
06754                              DAG.getConstant(0xFF00, MVT::i16));
06755     }
06756     // If Elt0 is defined, extract it from the appropriate source.  If the
06757     // source byte is not also even, shift the extracted word right 8 bits. If
06758     // Elt1 was also defined, OR the extracted values together before
06759     // inserting them in the result.
06760     if (Elt0 >= 0) {
06761       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
06762                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
06763       if ((Elt0 & 1) != 0)
06764         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
06765                               DAG.getConstant(8,
06766                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
06767       else if (Elt1 >= 0)
06768         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
06769                              DAG.getConstant(0x00FF, MVT::i16));
06770       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
06771                          : InsElt0;
06772     }
06773     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
06774                        DAG.getIntPtrConstant(i));
06775   }
06776   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
06777 }
06778 
06779 // v32i8 shuffles - Translate to VPSHUFB if possible.
06780 static
06781 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
06782                                  const X86Subtarget *Subtarget,
06783                                  SelectionDAG &DAG) {
06784   MVT VT = SVOp->getSimpleValueType(0);
06785   SDValue V1 = SVOp->getOperand(0);
06786   SDValue V2 = SVOp->getOperand(1);
06787   SDLoc dl(SVOp);
06788   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
06789 
06790   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
06791   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
06792   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
06793 
06794   // VPSHUFB may be generated if
06795   // (1) one of the input vectors is undef or zeroinitializer (the mask value
06796   //     0x80 puts 0 in the corresponding slot of the vector), and
06797   // (2) the mask indices don't cross a 128-bit lane.
06798   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
06799       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
06800     return SDValue();
06801 
06802   if (V1IsAllZero && !V2IsAllZero) {
06803     CommuteVectorShuffleMask(MaskVals, 32);
06804     V1 = V2;
06805   }
06806   return getPSHUFB(MaskVals, V1, dl, DAG);
06807 }
06808 
06809 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4-wide
06810 /// ones, or rewriting v4i32 / v4f32 as 2-wide ones if possible. This can be
06811 /// done when every pair / quad of shuffle mask elements points to elements in
06812 /// the right sequence, e.g.
06813 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
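      /// which (for a v8i16 input) is rewritten as a v4i32 shuffle with mask
      /// <1, 5, 0, 7>.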
06814 static
06815 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
06816                                  SelectionDAG &DAG) {
06817   MVT VT = SVOp->getSimpleValueType(0);
06818   SDLoc dl(SVOp);
06819   unsigned NumElems = VT.getVectorNumElements();
06820   MVT NewVT;
06821   unsigned Scale;
06822   switch (VT.SimpleTy) {
06823   default: llvm_unreachable("Unexpected!");
06824   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
06825   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
06826   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
06827   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
06828   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
06829   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
06830   }
06831 
06832   SmallVector<int, 8> MaskVec;
06833   for (unsigned i = 0; i != NumElems; i += Scale) {
06834     int StartIdx = -1;
06835     for (unsigned j = 0; j != Scale; ++j) {
06836       int EltIdx = SVOp->getMaskElt(i+j);
06837       if (EltIdx < 0)
06838         continue;
06839       if (StartIdx < 0)
06840         StartIdx = (EltIdx / Scale);
06841       if (EltIdx != (int)(StartIdx*Scale + j))
06842         return SDValue();
06843     }
06844     MaskVec.push_back(StartIdx);
06845   }
06846 
06847   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
06848   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
06849   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
06850 }
06851 
06852 /// getVZextMovL - Return a zero-extending vector move low node.
06853 ///
06854 static SDValue getVZextMovL(MVT VT, MVT OpVT,
06855                             SDValue SrcOp, SelectionDAG &DAG,
06856                             const X86Subtarget *Subtarget, SDLoc dl) {
06857   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
06858     LoadSDNode *LD = NULL;
06859     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
06860       LD = dyn_cast<LoadSDNode>(SrcOp);
06861     if (!LD) {
06862       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
06863       // instead.
06864       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
06865       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
06866           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
06867           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
06868           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
06869         // PR2108
06870         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
06871         return DAG.getNode(ISD::BITCAST, dl, VT,
06872                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
06873                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
06874                                                    OpVT,
06875                                                    SrcOp.getOperand(0)
06876                                                           .getOperand(0))));
06877       }
06878     }
06879   }
06880 
06881   return DAG.getNode(ISD::BITCAST, dl, VT,
06882                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
06883                                  DAG.getNode(ISD::BITCAST, dl,
06884                                              OpVT, SrcOp)));
06885 }
06886 
06887 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles that
06888 /// could not be matched by any known target-specific shuffle pattern.
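      /// The result is built by splitting the shuffle into two half-width
      /// shuffles, one per 128-bit lane of the result (falling back to a
      /// BUILD_VECTOR when a lane would need more than two inputs), and
      /// concatenating the two halves back together.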
06889 static SDValue
06890 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
06891 
06892   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
06893   if (NewOp.getNode())
06894     return NewOp;
06895 
06896   MVT VT = SVOp->getSimpleValueType(0);
06897 
06898   unsigned NumElems = VT.getVectorNumElements();
06899   unsigned NumLaneElems = NumElems / 2;
06900 
06901   SDLoc dl(SVOp);
06902   MVT EltVT = VT.getVectorElementType();
06903   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
06904   SDValue Output[2];
06905 
06906   SmallVector<int, 16> Mask;
06907   for (unsigned l = 0; l < 2; ++l) {
06908     // Build a shuffle mask for the output, discovering on the fly which
06909     // input vectors to use as shuffle operands (recorded in InputUsed).
06910     // If building a suitable shuffle vector proves too hard, then bail
06911     // out with UseBuildVector set.
06912     bool UseBuildVector = false;
06913     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
06914     unsigned LaneStart = l * NumLaneElems;
06915     for (unsigned i = 0; i != NumLaneElems; ++i) {
06916       // The mask element.  This indexes into the input.
06917       int Idx = SVOp->getMaskElt(i+LaneStart);
06918       if (Idx < 0) {
06919         // the mask element does not index into any input vector.
06920         Mask.push_back(-1);
06921         continue;
06922       }
06923 
06924       // The input vector this mask element indexes into.
06925       int Input = Idx / NumLaneElems;
06926 
06927       // Turn the index into an offset from the start of the input vector.
06928       Idx -= Input * NumLaneElems;
06929 
06930       // Find or create a shuffle vector operand to hold this input.
06931       unsigned OpNo;
06932       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
06933         if (InputUsed[OpNo] == Input)
06934           // This input vector is already an operand.
06935           break;
06936         if (InputUsed[OpNo] < 0) {
06937           // Create a new operand for this input vector.
06938           InputUsed[OpNo] = Input;
06939           break;
06940         }
06941       }
06942 
06943       if (OpNo >= array_lengthof(InputUsed)) {
06944         // More than two input vectors used!  Give up on trying to create a
06945         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
06946         UseBuildVector = true;
06947         break;
06948       }
06949 
06950       // Add the mask index for the new shuffle vector.
06951       Mask.push_back(Idx + OpNo * NumLaneElems);
06952     }
06953 
06954     if (UseBuildVector) {
06955       SmallVector<SDValue, 16> SVOps;
06956       for (unsigned i = 0; i != NumLaneElems; ++i) {
06957         // The mask element.  This indexes into the input.
06958         int Idx = SVOp->getMaskElt(i+LaneStart);
06959         if (Idx < 0) {
06960           SVOps.push_back(DAG.getUNDEF(EltVT));
06961           continue;
06962         }
06963 
06964         // The input vector this mask element indexes into.
06965         int Input = Idx / NumElems;
06966 
06967         // Turn the index into an offset from the start of the input vector.
06968         Idx -= Input * NumElems;
06969 
06970         // Extract the vector element by hand.
06971         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
06972                                     SVOp->getOperand(Input),
06973                                     DAG.getIntPtrConstant(Idx)));
06974       }
06975 
06976       // Construct the output using a BUILD_VECTOR.
06977       Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
06978                               SVOps.size());
06979     } else if (InputUsed[0] < 0) {
06980       // No input vectors were used! The result is undefined.
06981       Output[l] = DAG.getUNDEF(NVT);
06982     } else {
06983       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
06984                                         (InputUsed[0] % 2) * NumLaneElems,
06985                                         DAG, dl);
06986       // If only one input was used, use an undefined vector for the other.
06987       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
06988         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
06989                             (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
06990       // At least one input vector was used. Create a new shuffle vector.
06991       Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
06992     }
06993 
06994     Mask.clear();
06995   }
06996 
06997   // Concatenate the results back together.
06998   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
06999 }
07000 
07001 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
07002 /// 4 elements, and match them with several different shuffle types.
07003 static SDValue
07004 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
07005   SDValue V1 = SVOp->getOperand(0);
07006   SDValue V2 = SVOp->getOperand(1);
07007   SDLoc dl(SVOp);
07008   MVT VT = SVOp->getSimpleValueType(0);
07009 
07010   assert(VT.is128BitVector() && "Unsupported vector size");
07011 
07012   std::pair<int, int> Locs[4];
07013   int Mask1[] = { -1, -1, -1, -1 };
07014   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
07015 
07016   unsigned NumHi = 0;
07017   unsigned NumLo = 0;
07018   for (unsigned i = 0; i != 4; ++i) {
07019     int Idx = PermMask[i];
07020     if (Idx < 0) {
07021       Locs[i] = std::make_pair(-1, -1);
07022     } else {
07023       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
07024       if (Idx < 4) {
07025         Locs[i] = std::make_pair(0, NumLo);
07026         Mask1[NumLo] = Idx;
07027         NumLo++;
07028       } else {
07029         Locs[i] = std::make_pair(1, NumHi);
07030         if (2+NumHi < 4)
07031           Mask1[2+NumHi] = Idx;
07032         NumHi++;
07033       }
07034     }
07035   }
07036 
07037   if (NumLo <= 2 && NumHi <= 2) {
07038     // No more than two elements come from either vector.  This can be
07039     // implemented with two shuffles.  The first shuffle gathers the elements.
07040     // The second shuffle, which takes the first shuffle as both of its
07041     // vector operands, puts the elements into the right order.
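          // For example, with mask <0, 4, 1, 5> the first shuffle produces
          // <V1[0], V1[1], V2[0], V2[1]> and the second shuffle reorders it
          // with mask <0, 2, 5, 7> into <V1[0], V2[0], V1[1], V2[1]>.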
07042     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
07043 
07044     int Mask2[] = { -1, -1, -1, -1 };
07045 
07046     for (unsigned i = 0; i != 4; ++i)
07047       if (Locs[i].first != -1) {
07048         unsigned Idx = (i < 2) ? 0 : 4;
07049         Idx += Locs[i].first * 2 + Locs[i].second;
07050         Mask2[i] = Idx;
07051       }
07052 
07053     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
07054   }
07055 
07056   if (NumLo == 3 || NumHi == 3) {
07057     // Otherwise, we must have three elements from one vector, call it X, and
07058     // one element from the other, call it Y.  First, use a shufps to build an
07059     // intermediate vector with the one element from Y and the element from X
07060     // that will be in the same half in the final destination (the indexes don't
07061     // matter). Then, use a shufps to build the final vector, taking the half
07062     // containing the element from Y from the intermediate, and the other half
07063     // from X.
07064     if (NumHi == 3) {
07065       // Normalize it so the 3 elements come from V1.
07066       CommuteVectorShuffleMask(PermMask, 4);
07067       std::swap(V1, V2);
07068     }
07069 
07070     // Find the element from V2.
07071     unsigned HiIndex;
07072     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
07073       int Val = PermMask[HiIndex];
07074       if (Val < 0)
07075         continue;
07076       if (Val >= 4)
07077         break;
07078     }
07079 
07080     Mask1[0] = PermMask[HiIndex];
07081     Mask1[1] = -1;
07082     Mask1[2] = PermMask[HiIndex^1];
07083     Mask1[3] = -1;
07084     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
07085 
07086     if (HiIndex >= 2) {
07087       Mask1[0] = PermMask[0];
07088       Mask1[1] = PermMask[1];
07089       Mask1[2] = HiIndex & 1 ? 6 : 4;
07090       Mask1[3] = HiIndex & 1 ? 4 : 6;
07091       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
07092     }
07093 
07094     Mask1[0] = HiIndex & 1 ? 2 : 0;
07095     Mask1[1] = HiIndex & 1 ? 0 : 2;
07096     Mask1[2] = PermMask[2];
07097     Mask1[3] = PermMask[3];
07098     if (Mask1[2] >= 0)
07099       Mask1[2] += 4;
07100     if (Mask1[3] >= 0)
07101       Mask1[3] += 4;
07102     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
07103   }
07104 
07105   // Break it into (shuffle shuffle_hi, shuffle_lo).
07106   int LoMask[] = { -1, -1, -1, -1 };
07107   int HiMask[] = { -1, -1, -1, -1 };
07108 
07109   int *MaskPtr = LoMask;
07110   unsigned MaskIdx = 0;
07111   unsigned LoIdx = 0;
07112   unsigned HiIdx = 2;
07113   for (unsigned i = 0; i != 4; ++i) {
07114     if (i == 2) {
07115       MaskPtr = HiMask;
07116       MaskIdx = 1;
07117       LoIdx = 0;
07118       HiIdx = 2;
07119     }
07120     int Idx = PermMask[i];
07121     if (Idx < 0) {
07122       Locs[i] = std::make_pair(-1, -1);
07123     } else if (Idx < 4) {
07124       Locs[i] = std::make_pair(MaskIdx, LoIdx);
07125       MaskPtr[LoIdx] = Idx;
07126       LoIdx++;
07127     } else {
07128       Locs[i] = std::make_pair(MaskIdx, HiIdx);
07129       MaskPtr[HiIdx] = Idx;
07130       HiIdx++;
07131     }
07132   }
07133 
07134   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
07135   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
07136   int MaskOps[] = { -1, -1, -1, -1 };
07137   for (unsigned i = 0; i != 4; ++i)
07138     if (Locs[i].first != -1)
07139       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
07140   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
07141 }
07142 
07143 static bool MayFoldVectorLoad(SDValue V) {
07144   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
07145     V = V.getOperand(0);
07146 
07147   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
07148     V = V.getOperand(0);
07149   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
07150       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
07151     // BUILD_VECTOR (load), undef
07152     V = V.getOperand(0);
07153 
07154   return MayFoldLoad(V);
07155 }
07156 
07157 static
07158 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
07159   MVT VT = Op.getSimpleValueType();
07160 
07161   // Canonicalize to v2f64.
07162   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
07163   return DAG.getNode(ISD::BITCAST, dl, VT,
07164                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
07165                                           V1, DAG));
07166 }
07167 
07168 static
07169 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
07170                         bool HasSSE2) {
07171   SDValue V1 = Op.getOperand(0);
07172   SDValue V2 = Op.getOperand(1);
07173   MVT VT = Op.getSimpleValueType();
07174 
07175   assert(VT != MVT::v2i64 && "unsupported shuffle type");
07176 
07177   if (HasSSE2 && VT == MVT::v2f64)
07178     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
07179 
07180   // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
07181   return DAG.getNode(ISD::BITCAST, dl, VT,
07182                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
07183                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
07184                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
07185 }
07186 
07187 static
07188 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
07189   SDValue V1 = Op.getOperand(0);
07190   SDValue V2 = Op.getOperand(1);
07191   MVT VT = Op.getSimpleValueType();
07192 
07193   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
07194          "unsupported shuffle type");
07195 
07196   if (V2.getOpcode() == ISD::UNDEF)
07197     V2 = V1;
07198 
07199   // v4i32 or v4f32
07200   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
07201 }
07202 
07203 static
07204 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
07205   SDValue V1 = Op.getOperand(0);
07206   SDValue V2 = Op.getOperand(1);
07207   MVT VT = Op.getSimpleValueType();
07208   unsigned NumElems = VT.getVectorNumElements();
07209 
07210   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
07211   // operand of these instructions is only memory, so check if there's a
07212   // potential load folding here, otherwise use SHUFPS or MOVSD to match the
07213   // same masks.
07214   bool CanFoldLoad = false;
07215 
07216   // Trivial case, when V2 comes from a load.
07217   if (MayFoldVectorLoad(V2))
07218     CanFoldLoad = true;
07219 
07220   // When V1 is a load, it can be folded later into a store in isel, example:
07221   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
07222   //    turns into:
07223   //  (MOVLPSmr addr:$src1, VR128:$src2)
07224   // So, recognize this potential and also use MOVLPS or MOVLPD
07225   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
07226     CanFoldLoad = true;
07227 
07228   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
07229   if (CanFoldLoad) {
07230     if (HasSSE2 && NumElems == 2)
07231       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
07232 
07233     if (NumElems == 4)
07234       // If we don't care about the second element, proceed to use movss.
07235       if (SVOp->getMaskElt(1) != -1)
07236         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
07237   }
07238 
07239   // movl and movlp will both match v2i64, but v2i64 is never matched by
07240   // movl earlier because we make it strict to avoid messing with the movlp load
07241   // folding logic (see the code above the getMOVLP call). Match it here then;
07242   // this is horrible, but will stay like this until we move all shuffle
07243   // matching to x86-specific nodes. Note that for the 1st condition all
07244   // types are matched with movsd.
07245   if (HasSSE2) {
07246     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
07247     // as to remove this logic from here, as much as possible
07248     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
07249       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
07250     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
07251   }
07252 
07253   assert(VT != MVT::v4i32 && "unsupported shuffle type");
07254 
07255   // Invert the operand order and use SHUFPS to match it.
07256   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
07257                               getShuffleSHUFImmediate(SVOp), DAG);
07258 }
07259 
07260 // Reduce a vector shuffle to zext.
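      // For example, a v8i16 shuffle with mask <0, -1, 1, -1, 2, -1, 3, -1> and
      // an undef second operand can be lowered as a zero extension of the low
      // four i16 elements to v4i32 (the undef odd lanes are simply filled with
      // zeros by PMOVZX).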
07261 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
07262                                     SelectionDAG &DAG) {
07263   // PMOVZX is only available from SSE41.
07264   if (!Subtarget->hasSSE41())
07265     return SDValue();
07266 
07267   MVT VT = Op.getSimpleValueType();
07268 
07269   // Only AVX2 supports 256-bit vector integer extension.
07270   if (!Subtarget->hasInt256() && VT.is256BitVector())
07271     return SDValue();
07272 
07273   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
07274   SDLoc DL(Op);
07275   SDValue V1 = Op.getOperand(0);
07276   SDValue V2 = Op.getOperand(1);
07277   unsigned NumElems = VT.getVectorNumElements();
07278 
07279   // Extension is a unary operation, and the element type of the source vector
07280   // must be smaller than i64.
07281   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
07282       VT.getVectorElementType() == MVT::i64)
07283     return SDValue();
07284 
07285   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
07286   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
07287   while ((1U << Shift) < NumElems) {
07288     if (SVOp->getMaskElt(1U << Shift) == 1)
07289       break;
07290     Shift += 1;
07291     // The maximal ratio is 8, i.e. from i8 to i64.
07292     if (Shift > 3)
07293       return SDValue();
07294   }
07295 
07296   // Check the shuffle mask.
07297   unsigned Mask = (1U << Shift) - 1;
07298   for (unsigned i = 0; i != NumElems; ++i) {
07299     int EltIdx = SVOp->getMaskElt(i);
07300     if ((i & Mask) != 0 && EltIdx != -1)
07301       return SDValue();
07302     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
07303       return SDValue();
07304   }
07305 
07306   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
07307   MVT NeVT = MVT::getIntegerVT(NBits);
07308   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
07309 
07310   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
07311     return SDValue();
07312 
07313   // Simplify the operand as it's prepared to be fed into shuffle.
07314   unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
07315   if (V1.getOpcode() == ISD::BITCAST &&
07316       V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
07317       V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
07318       V1.getOperand(0).getOperand(0)
07319         .getSimpleValueType().getSizeInBits() == SignificantBits) {
07320     // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
07321     SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
07322     ConstantSDNode *CIdx =
07323       dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
07324     // If it's foldable, i.e. a normal load with a single use, we will let code
07325     // selection fold it. Otherwise, we will shorten the conversion sequence.
07326     if (CIdx && CIdx->getZExtValue()