LLVM API Documentation

X86ISelLowering.cpp
Go to the documentation of this file.
00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86InstrBuilder.h"
00019 #include "X86MachineFunctionInfo.h"
00020 #include "X86TargetMachine.h"
00021 #include "X86TargetObjectFile.h"
00022 #include "llvm/ADT/SmallSet.h"
00023 #include "llvm/ADT/Statistic.h"
00024 #include "llvm/ADT/StringExtras.h"
00025 #include "llvm/ADT/StringSwitch.h"
00026 #include "llvm/ADT/VariadicFunction.h"
00027 #include "llvm/CodeGen/IntrinsicLowering.h"
00028 #include "llvm/CodeGen/MachineFrameInfo.h"
00029 #include "llvm/CodeGen/MachineFunction.h"
00030 #include "llvm/CodeGen/MachineInstrBuilder.h"
00031 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00032 #include "llvm/CodeGen/MachineModuleInfo.h"
00033 #include "llvm/CodeGen/MachineRegisterInfo.h"
00034 #include "llvm/IR/CallSite.h"
00035 #include "llvm/IR/CallingConv.h"
00036 #include "llvm/IR/Constants.h"
00037 #include "llvm/IR/DerivedTypes.h"
00038 #include "llvm/IR/Function.h"
00039 #include "llvm/IR/GlobalAlias.h"
00040 #include "llvm/IR/GlobalVariable.h"
00041 #include "llvm/IR/Instructions.h"
00042 #include "llvm/IR/Intrinsics.h"
00043 #include "llvm/MC/MCAsmInfo.h"
00044 #include "llvm/MC/MCContext.h"
00045 #include "llvm/MC/MCExpr.h"
00046 #include "llvm/MC/MCSymbol.h"
00047 #include "llvm/Support/CommandLine.h"
00048 #include "llvm/Support/Debug.h"
00049 #include "llvm/Support/ErrorHandling.h"
00050 #include "llvm/Support/MathExtras.h"
00051 #include "llvm/Target/TargetOptions.h"
00052 #include <bitset>
00053 #include <numeric>
00054 #include <cctype>
00055 using namespace llvm;
00056 
00057 #define DEBUG_TYPE "x86-isel"
00058 
00059 STATISTIC(NumTailCalls, "Number of tail calls");
00060 
00061 static cl::opt<bool> ExperimentalVectorWideningLegalization(
00062     "x86-experimental-vector-widening-legalization", cl::init(false),
00063     cl::desc("Enable an experimental vector type legalization through widening "
00064              "rather than promotion."),
00065     cl::Hidden);
00066 
00067 static cl::opt<bool> ExperimentalVectorShuffleLowering(
00068     "x86-experimental-vector-shuffle-lowering", cl::init(false),
00069     cl::desc("Enable an experimental vector shuffle lowering code path."),
00070     cl::Hidden);
00071 
00072 // Forward declarations.
00073 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00074                        SDValue V2);
00075 
00076 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
00077                                 SelectionDAG &DAG, SDLoc dl,
00078                                 unsigned vectorWidth) {
00079   assert((vectorWidth == 128 || vectorWidth == 256) &&
00080          "Unsupported vector width");
00081   EVT VT = Vec.getValueType();
00082   EVT ElVT = VT.getVectorElementType();
00083   unsigned Factor = VT.getSizeInBits()/vectorWidth;
00084   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
00085                                   VT.getVectorNumElements()/Factor);
00086 
00087   // Extract from UNDEF is UNDEF.
00088   if (Vec.getOpcode() == ISD::UNDEF)
00089     return DAG.getUNDEF(ResultVT);
00090 
00091   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
00092   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
00093 
00094   // This is the index of the first element of the vectorWidth-bit chunk
00095   // we want.
00096   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
00097                                * ElemsPerChunk);
00098 
00099   // If the input is a buildvector just emit a smaller one.
00100   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
00101     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
00102                        makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
00103                                     ElemsPerChunk));
00104 
00105   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00106   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
00107                                VecIdx);
00108 
00109   return Result;
00110 
00111 }
00112 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
00113 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
00114 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
00115 /// instructions or a simple subregister reference. Idx is an index in the
00116 /// 128 bits we want.  It need not be aligned to a 128-bit bounday.  That makes
00117 /// lowering EXTRACT_VECTOR_ELT operations easier.
00118 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
00119                                    SelectionDAG &DAG, SDLoc dl) {
00120   assert((Vec.getValueType().is256BitVector() ||
00121           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
00122   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
00123 }
00124 
00125 /// Generate a DAG to grab 256-bits from a 512-bit vector.
00126 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
00127                                    SelectionDAG &DAG, SDLoc dl) {
00128   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
00129   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
00130 }
00131 
00132 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
00133                                unsigned IdxVal, SelectionDAG &DAG,
00134                                SDLoc dl, unsigned vectorWidth) {
00135   assert((vectorWidth == 128 || vectorWidth == 256) &&
00136          "Unsupported vector width");
00137   // Inserting UNDEF is Result
00138   if (Vec.getOpcode() == ISD::UNDEF)
00139     return Result;
00140   EVT VT = Vec.getValueType();
00141   EVT ElVT = VT.getVectorElementType();
00142   EVT ResultVT = Result.getValueType();
00143 
00144   // Insert the relevant vectorWidth bits.
00145   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
00146 
00147   // This is the index of the first element of the vectorWidth-bit chunk
00148   // we want.
00149   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
00150                                * ElemsPerChunk);
00151 
00152   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00153   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
00154                      VecIdx);
00155 }
00156 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
00157 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
00158 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
00159 /// simple superregister reference.  Idx is an index in the 128 bits
00160 /// we want.  It need not be aligned to a 128-bit bounday.  That makes
00161 /// lowering INSERT_VECTOR_ELT operations easier.
00162 static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
00163                                   unsigned IdxVal, SelectionDAG &DAG,
00164                                   SDLoc dl) {
00165   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
00166   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
00167 }
00168 
00169 static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
00170                                   unsigned IdxVal, SelectionDAG &DAG,
00171                                   SDLoc dl) {
00172   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
00173   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
00174 }
00175 
00176 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
00177 /// instructions. This is used because creating CONCAT_VECTOR nodes of
00178 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
00179 /// large BUILD_VECTORS.
00180 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
00181                                    unsigned NumElems, SelectionDAG &DAG,
00182                                    SDLoc dl) {
00183   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00184   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
00185 }
00186 
00187 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
00188                                    unsigned NumElems, SelectionDAG &DAG,
00189                                    SDLoc dl) {
00190   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00191   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
00192 }
00193 
00194 static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
00195   if (TT.isOSBinFormatMachO()) {
00196     if (TT.getArch() == Triple::x86_64)
00197       return new X86_64MachoTargetObjectFile();
00198     return new TargetLoweringObjectFileMachO();
00199   }
00200 
00201   if (TT.isOSLinux())
00202     return new X86LinuxTargetObjectFile();
00203   if (TT.isOSBinFormatELF())
00204     return new TargetLoweringObjectFileELF();
00205   if (TT.isKnownWindowsMSVCEnvironment())
00206     return new X86WindowsTargetObjectFile();
00207   if (TT.isOSBinFormatCOFF())
00208     return new TargetLoweringObjectFileCOFF();
00209   llvm_unreachable("unknown subtarget type");
00210 }
00211 
00212 // FIXME: This should stop caching the target machine as soon as
00213 // we can remove resetOperationActions et al.
00214 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
00215   : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
00216   Subtarget = &TM.getSubtarget<X86Subtarget>();
00217   X86ScalarSSEf64 = Subtarget->hasSSE2();
00218   X86ScalarSSEf32 = Subtarget->hasSSE1();
00219   TD = getDataLayout();
00220 
00221   resetOperationActions();
00222 }
00223 
00224 void X86TargetLowering::resetOperationActions() {
00225   const TargetMachine &TM = getTargetMachine();
00226   static bool FirstTimeThrough = true;
00227 
00228   // If none of the target options have changed, then we don't need to reset the
00229   // operation actions.
00230   if (!FirstTimeThrough && TO == TM.Options) return;
00231 
00232   if (!FirstTimeThrough) {
00233     // Reinitialize the actions.
00234     initActions();
00235     FirstTimeThrough = false;
00236   }
00237 
00238   TO = TM.Options;
00239 
00240   // Set up the TargetLowering object.
00241   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00242 
00243   // X86 is weird, it always uses i8 for shift amounts and setcc results.
00244   setBooleanContents(ZeroOrOneBooleanContent);
00245   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00246   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
00247 
00248   // For 64-bit since we have so many registers use the ILP scheduler, for
00249   // 32-bit code use the register pressure specific scheduling.
00250   // For Atom, always use ILP scheduling.
00251   if (Subtarget->isAtom())
00252     setSchedulingPreference(Sched::ILP);
00253   else if (Subtarget->is64Bit())
00254     setSchedulingPreference(Sched::ILP);
00255   else
00256     setSchedulingPreference(Sched::RegPressure);
00257   const X86RegisterInfo *RegInfo =
00258     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
00259   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00260 
00261   // Bypass expensive divides on Atom when compiling with O2
00262   if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
00263     addBypassSlowDiv(32, 8);
00264     if (Subtarget->is64Bit())
00265       addBypassSlowDiv(64, 16);
00266   }
00267 
00268   if (Subtarget->isTargetKnownWindowsMSVC()) {
00269     // Setup Windows compiler runtime calls.
00270     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00271     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00272     setLibcallName(RTLIB::SREM_I64, "_allrem");
00273     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00274     setLibcallName(RTLIB::MUL_I64, "_allmul");
00275     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00276     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00277     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00278     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00279     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00280 
00281     // The _ftol2 runtime function has an unusual calling conv, which
00282     // is modeled by a special pseudo-instruction.
00283     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
00284     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
00285     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
00286     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
00287   }
00288 
00289   if (Subtarget->isTargetDarwin()) {
00290     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00291     setUseUnderscoreSetJmp(false);
00292     setUseUnderscoreLongJmp(false);
00293   } else if (Subtarget->isTargetWindowsGNU()) {
00294     // MS runtime is weird: it exports _setjmp, but longjmp!
00295     setUseUnderscoreSetJmp(true);
00296     setUseUnderscoreLongJmp(false);
00297   } else {
00298     setUseUnderscoreSetJmp(true);
00299     setUseUnderscoreLongJmp(true);
00300   }
00301 
00302   // Set up the register classes.
00303   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00304   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00305   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00306   if (Subtarget->is64Bit())
00307     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00308 
00309   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
00310 
00311   // We don't accept any truncstore of integer registers.
00312   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00313   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00314   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00315   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00316   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00317   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00318 
00319   // SETOEQ and SETUNE require checking two conditions.
00320   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00321   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00322   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00323   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00324   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00325   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00326 
00327   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00328   // operation.
00329   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00330   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00331   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00332 
00333   if (Subtarget->is64Bit()) {
00334     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00335     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00336   } else if (!TM.Options.UseSoftFloat) {
00337     // We have an algorithm for SSE2->double, and we turn this into a
00338     // 64-bit FILD followed by conditional FADD for other targets.
00339     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00340     // We have an algorithm for SSE2, and we turn this into a 64-bit
00341     // FILD for other targets.
00342     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00343   }
00344 
00345   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00346   // this operation.
00347   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00348   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00349 
00350   if (!TM.Options.UseSoftFloat) {
00351     // SSE has no i16 to fp conversion, only i32
00352     if (X86ScalarSSEf32) {
00353       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00354       // f32 and f64 cases are Legal, f80 case is not
00355       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00356     } else {
00357       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00358       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00359     }
00360   } else {
00361     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00362     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00363   }
00364 
00365   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00366   // are Legal, f80 is custom lowered.
00367   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00368   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00369 
00370   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
00371   // this operation.
00372   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00373   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00374 
00375   if (X86ScalarSSEf32) {
00376     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00377     // f32 and f64 cases are Legal, f80 case is not
00378     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00379   } else {
00380     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00381     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00382   }
00383 
00384   // Handle FP_TO_UINT by promoting the destination to a larger signed
00385   // conversion.
00386   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00387   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00388   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
00389 
00390   if (Subtarget->is64Bit()) {
00391     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00392     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00393   } else if (!TM.Options.UseSoftFloat) {
00394     // Since AVX is a superset of SSE3, only check for SSE here.
00395     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00396       // Expand FP_TO_UINT into a select.
00397       // FIXME: We would like to use a Custom expander here eventually to do
00398       // the optimal thing for SSE vs. the default expansion in the legalizer.
00399       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00400     else
00401       // With SSE3 we can use fisttpll to convert to a signed i64; without
00402       // SSE, we're stuck with a fistpll.
00403       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00404   }
00405 
00406   if (isTargetFTOL()) {
00407     // Use the _ftol2 runtime function, which has a pseudo-instruction
00408     // to handle its weird calling convention.
00409     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00410   }
00411 
00412   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00413   if (!X86ScalarSSEf64) {
00414     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00415     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00416     if (Subtarget->is64Bit()) {
00417       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00418       // Without SSE, i64->f64 goes through memory.
00419       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00420     }
00421   }
00422 
00423   // Scalar integer divide and remainder are lowered to use operations that
00424   // produce two results, to match the available instructions. This exposes
00425   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00426   // into a single instruction.
00427   //
00428   // Scalar integer multiply-high is also lowered to use two-result
00429   // operations, to match the available instructions. However, plain multiply
00430   // (low) operations are left as Legal, as there are single-result
00431   // instructions for this in x86. Using the two-result multiply instructions
00432   // when both high and low results are needed must be arranged by dagcombine.
00433   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00434     MVT VT = IntVTs[i];
00435     setOperationAction(ISD::MULHS, VT, Expand);
00436     setOperationAction(ISD::MULHU, VT, Expand);
00437     setOperationAction(ISD::SDIV, VT, Expand);
00438     setOperationAction(ISD::UDIV, VT, Expand);
00439     setOperationAction(ISD::SREM, VT, Expand);
00440     setOperationAction(ISD::UREM, VT, Expand);
00441 
00442     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
00443     setOperationAction(ISD::ADDC, VT, Custom);
00444     setOperationAction(ISD::ADDE, VT, Custom);
00445     setOperationAction(ISD::SUBC, VT, Custom);
00446     setOperationAction(ISD::SUBE, VT, Custom);
00447   }
00448 
00449   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00450   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00451   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00452   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00453   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00454   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00455   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00456   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00457   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00458   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
00459   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
00460   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
00461   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
00462   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
00463   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
00464   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
00465   if (Subtarget->is64Bit())
00466     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00467   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00468   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00469   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00470   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00471   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00472   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00473   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00474   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00475 
00476   // Promote the i8 variants and force them on up to i32 which has a shorter
00477   // encoding.
00478   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00479   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00480   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00481   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
00482   if (Subtarget->hasBMI()) {
00483     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00484     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00485     if (Subtarget->is64Bit())
00486       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00487   } else {
00488     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00489     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00490     if (Subtarget->is64Bit())
00491       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00492   }
00493 
00494   if (Subtarget->hasLZCNT()) {
00495     // When promoting the i8 variants, force them to i32 for a shorter
00496     // encoding.
00497     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00498     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00499     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00500     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00501     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00502     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00503     if (Subtarget->is64Bit())
00504       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00505   } else {
00506     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00507     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00508     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00509     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00510     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00511     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00512     if (Subtarget->is64Bit()) {
00513       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00514       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00515     }
00516   }
00517 
00518   // Special handling for half-precision floating point conversions.
00519   // If we don't have F16C support, then lower half float conversions
00520   // into library calls.
00521   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
00522     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
00523     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
00524   }
00525 
00526   // There's never any support for operations beyond MVT::f32.
00527   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
00528   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
00529   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
00530   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
00531 
00532   setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
00533   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00534   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00535   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
00536 
00537   if (Subtarget->hasPOPCNT()) {
00538     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00539   } else {
00540     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00541     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00542     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00543     if (Subtarget->is64Bit())
00544       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00545   }
00546 
00547   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00548 
00549   if (!Subtarget->hasMOVBE())
00550     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00551 
00552   // These should be promoted to a larger select which is supported.
00553   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00554   // X86 wants to expand cmov itself.
00555   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00556   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00557   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00558   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00559   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00560   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00561   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00562   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00563   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00564   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00565   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00566   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00567   if (Subtarget->is64Bit()) {
00568     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00569     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00570   }
00571   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00572   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
00573   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
00574   // support continuation, user-level threading, and etc.. As a result, no
00575   // other SjLj exception interfaces are implemented and please don't build
00576   // your own exception handling based on them.
00577   // LLVM/Clang supports zero-cost DWARF exception handling.
00578   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00579   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00580 
00581   // Darwin ABI issue.
00582   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00583   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00584   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00585   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00586   if (Subtarget->is64Bit())
00587     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00588   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00589   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00590   if (Subtarget->is64Bit()) {
00591     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00592     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00593     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00594     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00595     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00596   }
00597   // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
00598   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00599   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00600   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00601   if (Subtarget->is64Bit()) {
00602     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00603     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00604     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00605   }
00606 
00607   if (Subtarget->hasSSE1())
00608     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00609 
00610   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00611 
00612   // Expand certain atomics
00613   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00614     MVT VT = IntVTs[i];
00615     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
00616     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00617     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00618   }
00619 
00620   if (Subtarget->hasCmpxchg16b()) {
00621     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
00622   }
00623 
00624   // FIXME - use subtarget debug flags
00625   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
00626       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
00627     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00628   }
00629 
00630   if (Subtarget->is64Bit()) {
00631     setExceptionPointerRegister(X86::RAX);
00632     setExceptionSelectorRegister(X86::RDX);
00633   } else {
00634     setExceptionPointerRegister(X86::EAX);
00635     setExceptionSelectorRegister(X86::EDX);
00636   }
00637   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00638   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00639 
00640   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00641   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00642 
00643   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00644   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00645 
00646   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00647   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00648   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00649   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00650     // TargetInfo::X86_64ABIBuiltinVaList
00651     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00652     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00653   } else {
00654     // TargetInfo::CharPtrBuiltinVaList
00655     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00656     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00657   }
00658 
00659   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00660   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00661 
00662   setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
00663                      MVT::i64 : MVT::i32, Custom);
00664 
00665   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00666     // f32 and f64 use SSE.
00667     // Set up the FP register classes.
00668     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00669     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00670 
00671     // Use ANDPD to simulate FABS.
00672     setOperationAction(ISD::FABS , MVT::f64, Custom);
00673     setOperationAction(ISD::FABS , MVT::f32, Custom);
00674 
00675     // Use XORP to simulate FNEG.
00676     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00677     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00678 
00679     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00680     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00681     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00682 
00683     // Lower this to FGETSIGNx86 plus an AND.
00684     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00685     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00686 
00687     // We don't support sin/cos/fmod
00688     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00689     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00690     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00691     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00692     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00693     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00694 
00695     // Expand FP immediates into loads from the stack, except for the special
00696     // cases we handle.
00697     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00698     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00699   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00700     // Use SSE for f32, x87 for f64.
00701     // Set up the FP register classes.
00702     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00703     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00704 
00705     // Use ANDPS to simulate FABS.
00706     setOperationAction(ISD::FABS , MVT::f32, Custom);
00707 
00708     // Use XORP to simulate FNEG.
00709     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00710 
00711     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00712 
00713     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00714     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00715     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00716 
00717     // We don't support sin/cos/fmod
00718     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00719     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00720     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00721 
00722     // Special cases we handle for FP constants.
00723     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00724     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00725     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00726     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00727     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00728 
00729     if (!TM.Options.UnsafeFPMath) {
00730       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00731       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00732       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00733     }
00734   } else if (!TM.Options.UseSoftFloat) {
00735     // f32 and f64 in x87.
00736     // Set up the FP register classes.
00737     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00738     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00739 
00740     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00741     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00742     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00743     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00744 
00745     if (!TM.Options.UnsafeFPMath) {
00746       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00747       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00748       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00749       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00750       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00751       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00752     }
00753     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00754     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00755     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00756     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00757     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00758     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00759     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00760     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00761   }
00762 
00763   // We don't support FMA.
00764   setOperationAction(ISD::FMA, MVT::f64, Expand);
00765   setOperationAction(ISD::FMA, MVT::f32, Expand);
00766 
00767   // Long double always uses X87.
00768   if (!TM.Options.UseSoftFloat) {
00769     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00770     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00771     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00772     {
00773       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00774       addLegalFPImmediate(TmpFlt);  // FLD0
00775       TmpFlt.changeSign();
00776       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00777 
00778       bool ignored;
00779       APFloat TmpFlt2(+1.0);
00780       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00781                       &ignored);
00782       addLegalFPImmediate(TmpFlt2);  // FLD1
00783       TmpFlt2.changeSign();
00784       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00785     }
00786 
00787     if (!TM.Options.UnsafeFPMath) {
00788       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00789       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00790       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00791     }
00792 
00793     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00794     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00795     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00796     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00797     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00798     setOperationAction(ISD::FMA, MVT::f80, Expand);
00799   }
00800 
00801   // Always use a library call for pow.
00802   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00803   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00804   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00805 
00806   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00807   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00808   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00809   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00810   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00811 
00812   // First set operation action for all vector types to either promote
00813   // (for widening) or expand (for scalarization). Then we will selectively
00814   // turn on ones that can be effectively codegen'd.
00815   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
00816            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
00817     MVT VT = (MVT::SimpleValueType)i;
00818     setOperationAction(ISD::ADD , VT, Expand);
00819     setOperationAction(ISD::SUB , VT, Expand);
00820     setOperationAction(ISD::FADD, VT, Expand);
00821     setOperationAction(ISD::FNEG, VT, Expand);
00822     setOperationAction(ISD::FSUB, VT, Expand);
00823     setOperationAction(ISD::MUL , VT, Expand);
00824     setOperationAction(ISD::FMUL, VT, Expand);
00825     setOperationAction(ISD::SDIV, VT, Expand);
00826     setOperationAction(ISD::UDIV, VT, Expand);
00827     setOperationAction(ISD::FDIV, VT, Expand);
00828     setOperationAction(ISD::SREM, VT, Expand);
00829     setOperationAction(ISD::UREM, VT, Expand);
00830     setOperationAction(ISD::LOAD, VT, Expand);
00831     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00832     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00833     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00834     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00835     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00836     setOperationAction(ISD::FABS, VT, Expand);
00837     setOperationAction(ISD::FSIN, VT, Expand);
00838     setOperationAction(ISD::FSINCOS, VT, Expand);
00839     setOperationAction(ISD::FCOS, VT, Expand);
00840     setOperationAction(ISD::FSINCOS, VT, Expand);
00841     setOperationAction(ISD::FREM, VT, Expand);
00842     setOperationAction(ISD::FMA,  VT, Expand);
00843     setOperationAction(ISD::FPOWI, VT, Expand);
00844     setOperationAction(ISD::FSQRT, VT, Expand);
00845     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00846     setOperationAction(ISD::FFLOOR, VT, Expand);
00847     setOperationAction(ISD::FCEIL, VT, Expand);
00848     setOperationAction(ISD::FTRUNC, VT, Expand);
00849     setOperationAction(ISD::FRINT, VT, Expand);
00850     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00851     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00852     setOperationAction(ISD::MULHS, VT, Expand);
00853     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00854     setOperationAction(ISD::MULHU, VT, Expand);
00855     setOperationAction(ISD::SDIVREM, VT, Expand);
00856     setOperationAction(ISD::UDIVREM, VT, Expand);
00857     setOperationAction(ISD::FPOW, VT, Expand);
00858     setOperationAction(ISD::CTPOP, VT, Expand);
00859     setOperationAction(ISD::CTTZ, VT, Expand);
00860     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00861     setOperationAction(ISD::CTLZ, VT, Expand);
00862     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00863     setOperationAction(ISD::SHL, VT, Expand);
00864     setOperationAction(ISD::SRA, VT, Expand);
00865     setOperationAction(ISD::SRL, VT, Expand);
00866     setOperationAction(ISD::ROTL, VT, Expand);
00867     setOperationAction(ISD::ROTR, VT, Expand);
00868     setOperationAction(ISD::BSWAP, VT, Expand);
00869     setOperationAction(ISD::SETCC, VT, Expand);
00870     setOperationAction(ISD::FLOG, VT, Expand);
00871     setOperationAction(ISD::FLOG2, VT, Expand);
00872     setOperationAction(ISD::FLOG10, VT, Expand);
00873     setOperationAction(ISD::FEXP, VT, Expand);
00874     setOperationAction(ISD::FEXP2, VT, Expand);
00875     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00876     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00877     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00878     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00879     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00880     setOperationAction(ISD::TRUNCATE, VT, Expand);
00881     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00882     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00883     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00884     setOperationAction(ISD::VSELECT, VT, Expand);
00885     setOperationAction(ISD::SELECT_CC, VT, Expand);
00886     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
00887              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
00888       setTruncStoreAction(VT,
00889                           (MVT::SimpleValueType)InnerVT, Expand);
00890     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
00891     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
00892 
00893     // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types,
00894     // we have to deal with them whether we ask for Expansion or not. Setting
00895     // Expand causes its own optimisation problems though, so leave them legal.
00896     if (VT.getVectorElementType() == MVT::i1)
00897       setLoadExtAction(ISD::EXTLOAD, VT, Expand);
00898   }
00899 
00900   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00901   // with -msoft-float, disable use of MMX as well.
00902   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00903     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00904     // No operations on x86mmx supported, everything uses intrinsics.
00905   }
00906 
00907   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00908   // into smaller operations.
00909   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
00910   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
00911   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
00912   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
00913   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
00914   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
00915   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
00916   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
00917   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
00918   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
00919   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
00920   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
00921   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
00922   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
00923   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
00924   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
00925   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
00926   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
00927   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
00928   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
00929   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00930   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
00931   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
00932   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
00933   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
00934   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
00935   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
00936   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
00937   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
00938 
00939   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00940     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00941 
00942     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00943     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00944     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00945     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00946     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00947     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00948     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00949     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00950     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00951     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00952     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00953     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00954   }
00955 
00956   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00957     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00958 
00959     // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
00960     // registers cannot be used even for integer operations.
00961     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00962     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00963     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00964     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00965 
00966     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00967     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00968     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00969     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00970     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00971     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00972     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
00973     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
00974     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
00975     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
00976     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00977     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00978     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00979     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00980     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00981     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00982     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00983     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00984     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00985     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00986     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00987     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00988 
00989     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00990     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00991     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00992     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00993 
00994     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00995     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00996     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00997     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00998     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00999 
01000     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
01001     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01002       MVT VT = (MVT::SimpleValueType)i;
01003       // Do not attempt to custom lower non-power-of-2 vectors
01004       if (!isPowerOf2_32(VT.getVectorNumElements()))
01005         continue;
01006       // Do not attempt to custom lower non-128-bit vectors
01007       if (!VT.is128BitVector())
01008         continue;
01009       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01010       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01011       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01012     }
01013 
01014     // We support custom legalizing of sext and anyext loads for specific
01015     // memory vector types which we can load as a scalar (or sequence of
01016     // scalars) and extend in-register to a legal 128-bit vector type. For sext
01017     // loads these must work with a single scalar load.
01018     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
01019     if (Subtarget->is64Bit()) {
01020       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
01021       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
01022     }
01023     setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
01024     setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
01025     setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
01026     setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
01027     setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
01028     setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);
01029 
01030     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
01031     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
01032     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
01033     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
01034     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
01035     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
01036 
01037     if (Subtarget->is64Bit()) {
01038       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01039       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01040     }
01041 
01042     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
01043     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01044       MVT VT = (MVT::SimpleValueType)i;
01045 
01046       // Do not attempt to promote non-128-bit vectors
01047       if (!VT.is128BitVector())
01048         continue;
01049 
01050       setOperationAction(ISD::AND,    VT, Promote);
01051       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
01052       setOperationAction(ISD::OR,     VT, Promote);
01053       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
01054       setOperationAction(ISD::XOR,    VT, Promote);
01055       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
01056       setOperationAction(ISD::LOAD,   VT, Promote);
01057       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
01058       setOperationAction(ISD::SELECT, VT, Promote);
01059       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
01060     }
01061 
01062     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
01063 
01064     // Custom lower v2i64 and v2f64 selects.
01065     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
01066     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
01067     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
01068     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
01069 
01070     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
01071     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
01072 
01073     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
01074     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
01075     // As there is no 64-bit GPR available, we need build a special custom
01076     // sequence to convert from v2i32 to v2f32.
01077     if (!Subtarget->is64Bit())
01078       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
01079 
01080     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
01081     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
01082 
01083     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
01084 
01085     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
01086     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
01087     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
01088   }
01089 
01090   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
01091     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
01092     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
01093     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
01094     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
01095     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
01096     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
01097     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
01098     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
01099     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
01100     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
01101 
01102     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
01103     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
01104     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
01105     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
01106     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
01107     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
01108     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
01109     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
01110     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
01111     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
01112 
01113     // FIXME: Do we need to handle scalar-to-vector here?
01114     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
01115 
01116     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
01117     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
01118     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
01119     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
01120     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
01121     // There is no BLENDI for byte vectors. We don't need to custom lower
01122     // some vselects for now.
01123     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
01124 
01125     // SSE41 brings specific instructions for doing vector sign extend even in
01126     // cases where we don't have SRA.
01127     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
01128     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
01129     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);
01130 
01131     // i8 and i16 vectors are custom , because the source register and source
01132     // source memory operand types are not the same width.  f32 vectors are
01133     // custom since the immediate controlling the insert encodes additional
01134     // information.
01135     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
01136     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01137     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01138     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01139 
01140     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
01141     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
01142     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
01143     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
01144 
01145     // FIXME: these should be Legal but thats only for the case where
01146     // the index is constant.  For now custom expand to deal with that.
01147     if (Subtarget->is64Bit()) {
01148       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01149       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01150     }
01151   }
01152 
01153   if (Subtarget->hasSSE2()) {
01154     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
01155     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
01156 
01157     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01158     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01159 
01160     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01161     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01162 
01163     // In the customized shift lowering, the legal cases in AVX2 will be
01164     // recognized.
01165     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01166     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01167 
01168     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01169     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01170 
01171     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01172   }
01173 
01174   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01175     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01176     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01177     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01178     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01179     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01180     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01181 
01182     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01183     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01184     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01185 
01186     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01187     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01188     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01189     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01190     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01191     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01192     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01193     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01194     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01195     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01196     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01197     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01198 
01199     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01200     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01201     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01202     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01203     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01204     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01205     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01206     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01207     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01208     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01209     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01210     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01211 
01212     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01213     // even though v8i16 is a legal type.
01214     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01215     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01216     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01217 
01218     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01219     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01220     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01221 
01222     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01223     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01224 
01225     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
01226 
01227     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01228     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01229 
01230     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01231     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01232 
01233     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01234     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01235 
01236     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01237     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01238     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01239     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01240 
01241     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01242     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01243     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01244 
01245     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
01246     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
01247     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
01248     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
01249 
01250     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01251     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01252     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01253     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01254     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01255     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01256     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01257     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01258     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01259     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01260     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01261     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01262 
01263     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01264       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01265       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01266       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01267       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01268       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01269       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01270     }
01271 
01272     if (Subtarget->hasInt256()) {
01273       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01274       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01275       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01276       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01277 
01278       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01279       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01280       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01281       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01282 
01283       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01284       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01285       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01286       // Don't lower v32i8 because there is no 128-bit byte mul
01287 
01288       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
01289       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
01290       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
01291       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
01292 
01293       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
01294       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01295     } else {
01296       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01297       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01298       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01299       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01300 
01301       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01302       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01303       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01304       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01305 
01306       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01307       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01308       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01309       // Don't lower v32i8 because there is no 128-bit byte mul
01310     }
01311 
01312     // In the customized shift lowering, the legal cases in AVX2 will be
01313     // recognized.
01314     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01315     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01316 
01317     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01318     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01319 
01320     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01321 
01322     // Custom lower several nodes for 256-bit types.
01323     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01324              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01325       MVT VT = (MVT::SimpleValueType)i;
01326 
01327       // Extract subvector is special because the value type
01328       // (result) is 128-bit but the source is 256-bit wide.
01329       if (VT.is128BitVector())
01330         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01331 
01332       // Do not attempt to custom lower other non-256-bit vectors
01333       if (!VT.is256BitVector())
01334         continue;
01335 
01336       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01337       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01338       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01339       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01340       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01341       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01342       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01343     }
01344 
01345     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01346     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01347       MVT VT = (MVT::SimpleValueType)i;
01348 
01349       // Do not attempt to promote non-256-bit vectors
01350       if (!VT.is256BitVector())
01351         continue;
01352 
01353       setOperationAction(ISD::AND,    VT, Promote);
01354       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01355       setOperationAction(ISD::OR,     VT, Promote);
01356       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01357       setOperationAction(ISD::XOR,    VT, Promote);
01358       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01359       setOperationAction(ISD::LOAD,   VT, Promote);
01360       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01361       setOperationAction(ISD::SELECT, VT, Promote);
01362       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01363     }
01364   }
01365 
01366   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01367     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01368     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01369     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01370     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01371 
01372     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01373     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01374     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01375 
01376     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01377     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01378     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01379     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01380     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01381     setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
01382     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01383     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01384     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01385     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01386     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01387 
01388     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01389     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01390     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01391     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01392     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01393     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01394 
01395     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01396     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01397     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01398     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01399     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01400     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01401     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01402     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01403 
01404     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01405     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01406     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01407     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01408     if (Subtarget->is64Bit()) {
01409       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01410       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01411       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01412       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01413     }
01414     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01415     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01416     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01417     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01418     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01419     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01420     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01421     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01422     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01423     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01424 
01425     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01426     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01427     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01428     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01429     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01430     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01431     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01432     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01433     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01434     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01435     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01436     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01437     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01438 
01439     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01440     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01441     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01442     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01443     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
01444     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01445 
01446     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01447     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01448 
01449     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01450 
01451     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01452     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01453     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01454     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01455     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01456     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01457     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01458     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01459     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01460 
01461     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01462     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01463 
01464     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01465     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01466 
01467     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01468 
01469     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01470     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01471 
01472     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01473     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01474 
01475     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01476     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01477 
01478     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01479     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01480     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01481     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01482     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01483     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01484 
01485     if (Subtarget->hasCDI()) {
01486       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
01487       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
01488     }
01489 
01490     // Custom lower several nodes.
01491     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01492              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01493       MVT VT = (MVT::SimpleValueType)i;
01494 
01495       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01496       // Extract subvector is special because the value type
01497       // (result) is 256/128-bit but the source is 512-bit wide.
01498       if (VT.is128BitVector() || VT.is256BitVector())
01499         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01500 
01501       if (VT.getVectorElementType() == MVT::i1)
01502         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01503 
01504       // Do not attempt to custom lower other non-512-bit vectors
01505       if (!VT.is512BitVector())
01506         continue;
01507 
01508       if ( EltSize >= 32) {
01509         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01510         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01511         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01512         setOperationAction(ISD::VSELECT,             VT, Legal);
01513         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01514         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01515         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01516       }
01517     }
01518     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01519       MVT VT = (MVT::SimpleValueType)i;
01520 
01521       // Do not attempt to promote non-256-bit vectors
01522       if (!VT.is512BitVector())
01523         continue;
01524 
01525       setOperationAction(ISD::SELECT, VT, Promote);
01526       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01527     }
01528   }// has  AVX-512
01529 
01530   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
01531     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
01532     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
01533   }
01534 
01535   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
01536   // of this type with custom code.
01537   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
01538            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
01539     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
01540                        Custom);
01541   }
01542 
01543   // We want to custom lower some of our intrinsics.
01544   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01545   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01546   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01547   if (!Subtarget->is64Bit())
01548     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01549 
01550   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01551   // handle type legalization for these operations here.
01552   //
01553   // FIXME: We really should do custom legalization for addition and
01554   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01555   // than generic legalization for 64-bit multiplication-with-overflow, though.
01556   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01557     // Add/Sub/Mul with overflow operations are custom lowered.
01558     MVT VT = IntVTs[i];
01559     setOperationAction(ISD::SADDO, VT, Custom);
01560     setOperationAction(ISD::UADDO, VT, Custom);
01561     setOperationAction(ISD::SSUBO, VT, Custom);
01562     setOperationAction(ISD::USUBO, VT, Custom);
01563     setOperationAction(ISD::SMULO, VT, Custom);
01564     setOperationAction(ISD::UMULO, VT, Custom);
01565   }
01566 
01567   // There are no 8-bit 3-address imul/mul instructions
01568   setOperationAction(ISD::SMULO, MVT::i8, Expand);
01569   setOperationAction(ISD::UMULO, MVT::i8, Expand);
01570 
01571   if (!Subtarget->is64Bit()) {
01572     // These libcalls are not available in 32-bit.
01573     setLibcallName(RTLIB::SHL_I128, nullptr);
01574     setLibcallName(RTLIB::SRL_I128, nullptr);
01575     setLibcallName(RTLIB::SRA_I128, nullptr);
01576   }
01577 
01578   // Combine sin / cos into one node or libcall if possible.
01579   if (Subtarget->hasSinCos()) {
01580     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01581     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01582     if (Subtarget->isTargetDarwin()) {
01583       // For MacOSX, we don't want to the normal expansion of a libcall to
01584       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
01585       // traffic.
01586       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01587       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01588     }
01589   }
01590 
01591   if (Subtarget->isTargetWin64()) {
01592     setOperationAction(ISD::SDIV, MVT::i128, Custom);
01593     setOperationAction(ISD::UDIV, MVT::i128, Custom);
01594     setOperationAction(ISD::SREM, MVT::i128, Custom);
01595     setOperationAction(ISD::UREM, MVT::i128, Custom);
01596     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
01597     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
01598   }
01599 
01600   // We have target-specific dag combine patterns for the following nodes:
01601   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01602   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01603   setTargetDAGCombine(ISD::VSELECT);
01604   setTargetDAGCombine(ISD::SELECT);
01605   setTargetDAGCombine(ISD::SHL);
01606   setTargetDAGCombine(ISD::SRA);
01607   setTargetDAGCombine(ISD::SRL);
01608   setTargetDAGCombine(ISD::OR);
01609   setTargetDAGCombine(ISD::AND);
01610   setTargetDAGCombine(ISD::ADD);
01611   setTargetDAGCombine(ISD::FADD);
01612   setTargetDAGCombine(ISD::FSUB);
01613   setTargetDAGCombine(ISD::FMA);
01614   setTargetDAGCombine(ISD::SUB);
01615   setTargetDAGCombine(ISD::LOAD);
01616   setTargetDAGCombine(ISD::STORE);
01617   setTargetDAGCombine(ISD::ZERO_EXTEND);
01618   setTargetDAGCombine(ISD::ANY_EXTEND);
01619   setTargetDAGCombine(ISD::SIGN_EXTEND);
01620   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01621   setTargetDAGCombine(ISD::TRUNCATE);
01622   setTargetDAGCombine(ISD::SINT_TO_FP);
01623   setTargetDAGCombine(ISD::SETCC);
01624   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
01625   setTargetDAGCombine(ISD::BUILD_VECTOR);
01626   if (Subtarget->is64Bit())
01627     setTargetDAGCombine(ISD::MUL);
01628   setTargetDAGCombine(ISD::XOR);
01629 
01630   computeRegisterProperties();
01631 
01632   // On Darwin, -Os means optimize for size without hurting performance,
01633   // do not reduce the limit.
01634   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01635   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01636   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01637   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01638   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01639   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01640   setPrefLoopAlignment(4); // 2^4 bytes.
01641 
01642   // Predictable cmov don't hurt on atom because it's in-order.
01643   PredictableSelectIsExpensive = !Subtarget->isAtom();
01644 
01645   setPrefFunctionAlignment(4); // 2^4 bytes.
01646 }
01647 
01648 // This has so far only been implemented for 64-bit MachO.
01649 bool X86TargetLowering::useLoadStackGuardNode() const {
01650   return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO &&
01651          Subtarget->is64Bit();
01652 }
01653 
01654 TargetLoweringBase::LegalizeTypeAction
01655 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
01656   if (ExperimentalVectorWideningLegalization &&
01657       VT.getVectorNumElements() != 1 &&
01658       VT.getVectorElementType().getSimpleVT() != MVT::i1)
01659     return TypeWidenVector;
01660 
01661   return TargetLoweringBase::getPreferredVectorAction(VT);
01662 }
01663 
01664 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01665   if (!VT.isVector())
01666     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
01667 
01668   if (Subtarget->hasAVX512())
01669     switch(VT.getVectorNumElements()) {
01670     case  8: return MVT::v8i1;
01671     case 16: return MVT::v16i1;
01672   }
01673 
01674   return VT.changeVectorElementTypeToInteger();
01675 }
01676 
01677 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
01678 /// the desired ByVal argument alignment.
01679 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01680   if (MaxAlign == 16)
01681     return;
01682   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01683     if (VTy->getBitWidth() == 128)
01684       MaxAlign = 16;
01685   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01686     unsigned EltAlign = 0;
01687     getMaxByValAlign(ATy->getElementType(), EltAlign);
01688     if (EltAlign > MaxAlign)
01689       MaxAlign = EltAlign;
01690   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01691     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01692       unsigned EltAlign = 0;
01693       getMaxByValAlign(STy->getElementType(i), EltAlign);
01694       if (EltAlign > MaxAlign)
01695         MaxAlign = EltAlign;
01696       if (MaxAlign == 16)
01697         break;
01698     }
01699   }
01700 }
01701 
01702 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
01703 /// function arguments in the caller parameter area. For X86, aggregates
01704 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01705 /// are at 4-byte boundaries.
01706 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01707   if (Subtarget->is64Bit()) {
01708     // Max of 8 and alignment of type.
01709     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01710     if (TyAlign > 8)
01711       return TyAlign;
01712     return 8;
01713   }
01714 
01715   unsigned Align = 4;
01716   if (Subtarget->hasSSE1())
01717     getMaxByValAlign(Ty, Align);
01718   return Align;
01719 }
01720 
01721 /// getOptimalMemOpType - Returns the target specific optimal type for load
01722 /// and store operations as a result of memset, memcpy, and memmove
01723 /// lowering. If DstAlign is zero that means it's safe to destination
01724 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
01725 /// means there isn't a need to check it against alignment requirement,
01726 /// probably because the source does not need to be loaded. If 'IsMemset' is
01727 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
01728 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
01729 /// source is constant so it does not need to be loaded.
01730 /// It returns EVT::Other if the type should be determined using generic
01731 /// target-independent logic.
01732 EVT
01733 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01734                                        unsigned DstAlign, unsigned SrcAlign,
01735                                        bool IsMemset, bool ZeroMemset,
01736                                        bool MemcpyStrSrc,
01737                                        MachineFunction &MF) const {
01738   const Function *F = MF.getFunction();
01739   if ((!IsMemset || ZeroMemset) &&
01740       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
01741                                        Attribute::NoImplicitFloat)) {
01742     if (Size >= 16 &&
01743         (Subtarget->isUnalignedMemAccessFast() ||
01744          ((DstAlign == 0 || DstAlign >= 16) &&
01745           (SrcAlign == 0 || SrcAlign >= 16)))) {
01746       if (Size >= 32) {
01747         if (Subtarget->hasInt256())
01748           return MVT::v8i32;
01749         if (Subtarget->hasFp256())
01750           return MVT::v8f32;
01751       }
01752       if (Subtarget->hasSSE2())
01753         return MVT::v4i32;
01754       if (Subtarget->hasSSE1())
01755         return MVT::v4f32;
01756     } else if (!MemcpyStrSrc && Size >= 8 &&
01757                !Subtarget->is64Bit() &&
01758                Subtarget->hasSSE2()) {
01759       // Do not use f64 to lower memcpy if source is string constant. It's
01760       // better to use i32 to avoid the loads.
01761       return MVT::f64;
01762     }
01763   }
01764   if (Subtarget->is64Bit() && Size >= 8)
01765     return MVT::i64;
01766   return MVT::i32;
01767 }
01768 
01769 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01770   if (VT == MVT::f32)
01771     return X86ScalarSSEf32;
01772   else if (VT == MVT::f64)
01773     return X86ScalarSSEf64;
01774   return true;
01775 }
01776 
01777 bool
01778 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
01779                                                   unsigned,
01780                                                   unsigned,
01781                                                   bool *Fast) const {
01782   if (Fast)
01783     *Fast = Subtarget->isUnalignedMemAccessFast();
01784   return true;
01785 }
01786 
01787 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
01788 /// current function.  The returned value is a member of the
01789 /// MachineJumpTableInfo::JTEntryKind enum.
01790 unsigned X86TargetLowering::getJumpTableEncoding() const {
01791   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01792   // symbol.
01793   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01794       Subtarget->isPICStyleGOT())
01795     return MachineJumpTableInfo::EK_Custom32;
01796 
01797   // Otherwise, use the normal jump table encoding heuristics.
01798   return TargetLowering::getJumpTableEncoding();
01799 }
01800 
01801 const MCExpr *
01802 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01803                                              const MachineBasicBlock *MBB,
01804                                              unsigned uid,MCContext &Ctx) const{
01805   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
01806          Subtarget->isPICStyleGOT());
01807   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01808   // entries.
01809   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01810                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01811 }
01812 
01813 /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
01814 /// jumptable.
01815 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01816                                                     SelectionDAG &DAG) const {
01817   if (!Subtarget->is64Bit())
01818     // This doesn't have SDLoc associated with it, but is not really the
01819     // same as a Register.
01820     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01821   return Table;
01822 }
01823 
01824 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
01825 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
01826 /// MCExpr.
01827 const MCExpr *X86TargetLowering::
01828 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01829                              MCContext &Ctx) const {
01830   // X86-64 uses RIP relative addressing based on the jump table label.
01831   if (Subtarget->isPICStyleRIPRel())
01832     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01833 
01834   // Otherwise, the reference is relative to the PIC base.
01835   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01836 }
01837 
01838 // FIXME: Why this routine is here? Move to RegInfo!
01839 std::pair<const TargetRegisterClass*, uint8_t>
01840 X86TargetLowering::findRepresentativeClass(MVT VT) const{
01841   const TargetRegisterClass *RRC = nullptr;
01842   uint8_t Cost = 1;
01843   switch (VT.SimpleTy) {
01844   default:
01845     return TargetLowering::findRepresentativeClass(VT);
01846   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01847     RRC = Subtarget->is64Bit() ?
01848       (const TargetRegisterClass*)&X86::GR64RegClass :
01849       (const TargetRegisterClass*)&X86::GR32RegClass;
01850     break;
01851   case MVT::x86mmx:
01852     RRC = &X86::VR64RegClass;
01853     break;
01854   case MVT::f32: case MVT::f64:
01855   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01856   case MVT::v4f32: case MVT::v2f64:
01857   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01858   case MVT::v4f64:
01859     RRC = &X86::VR128RegClass;
01860     break;
01861   }
01862   return std::make_pair(RRC, Cost);
01863 }
01864 
01865 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01866                                                unsigned &Offset) const {
01867   if (!Subtarget->isTargetLinux())
01868     return false;
01869 
01870   if (Subtarget->is64Bit()) {
01871     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
01872     Offset = 0x28;
01873     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01874       AddressSpace = 256;
01875     else
01876       AddressSpace = 257;
01877   } else {
01878     // %gs:0x14 on i386
01879     Offset = 0x14;
01880     AddressSpace = 256;
01881   }
01882   return true;
01883 }
01884 
01885 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01886                                             unsigned DestAS) const {
01887   assert(SrcAS != DestAS && "Expected different address spaces!");
01888 
01889   return SrcAS < 256 && DestAS < 256;
01890 }
01891 
01892 //===----------------------------------------------------------------------===//
01893 //               Return Value Calling Convention Implementation
01894 //===----------------------------------------------------------------------===//
01895 
01896 #include "X86GenCallingConv.inc"
01897 
01898 bool
01899 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01900                                   MachineFunction &MF, bool isVarArg,
01901                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01902                         LLVMContext &Context) const {
01903   SmallVector<CCValAssign, 16> RVLocs;
01904   CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
01905                  RVLocs, Context);
01906   return CCInfo.CheckReturn(Outs, RetCC_X86);
01907 }
01908 
01909 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
01910   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
01911   return ScratchRegs;
01912 }
01913 
01914 SDValue
01915 X86TargetLowering::LowerReturn(SDValue Chain,
01916                                CallingConv::ID CallConv, bool isVarArg,
01917                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01918                                const SmallVectorImpl<SDValue> &OutVals,
01919                                SDLoc dl, SelectionDAG &DAG) const {
01920   MachineFunction &MF = DAG.getMachineFunction();
01921   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01922 
01923   SmallVector<CCValAssign, 16> RVLocs;
01924   CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
01925                  RVLocs, *DAG.getContext());
01926   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01927 
01928   SDValue Flag;
01929   SmallVector<SDValue, 6> RetOps;
01930   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
01931   // Operand #1 = Bytes To Pop
01932   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
01933                    MVT::i16));
01934 
01935   // Copy the result values into the output registers.
01936   for (unsigned i = 0; i != RVLocs.size(); ++i) {
01937     CCValAssign &VA = RVLocs[i];
01938     assert(VA.isRegLoc() && "Can only return in registers!");
01939     SDValue ValToCopy = OutVals[i];
01940     EVT ValVT = ValToCopy.getValueType();
01941 
01942     // Promote values to the appropriate types
01943     if (VA.getLocInfo() == CCValAssign::SExt)
01944       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
01945     else if (VA.getLocInfo() == CCValAssign::ZExt)
01946       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
01947     else if (VA.getLocInfo() == CCValAssign::AExt)
01948       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
01949     else if (VA.getLocInfo() == CCValAssign::BCvt)
01950       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
01951 
01952     assert(VA.getLocInfo() != CCValAssign::FPExt &&
01953            "Unexpected FP-extend for return value.");  
01954 
01955     // If this is x86-64, and we disabled SSE, we can't return FP values,
01956     // or SSE or MMX vectors.
01957     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
01958          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
01959           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
01960       report_fatal_error("SSE register return with SSE disabled");
01961     }
01962     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
01963     // llvm-gcc has never done it right and no one has noticed, so this
01964     // should be OK for now.
01965     if (ValVT == MVT::f64 &&
01966         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
01967       report_fatal_error("SSE2 register return with SSE2 disabled");
01968 
01969     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
01970     // the RET instruction and handled by the FP Stackifier.
01971     if (VA.getLocReg() == X86::ST0 ||
01972         VA.getLocReg() == X86::ST1) {
01973       // If this is a copy from an xmm register to ST(0), use an FPExtend to
01974       // change the value to the FP stack register class.
01975       if (isScalarFPTypeInSSEReg(VA.getValVT()))
01976         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
01977       RetOps.push_back(ValToCopy);
01978       // Don't emit a copytoreg.
01979       continue;
01980     }
01981 
01982     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
01983     // which is returned in RAX / RDX.
01984     if (Subtarget->is64Bit()) {
01985       if (ValVT == MVT::x86mmx) {
01986         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
01987           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
01988           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
01989                                   ValToCopy);
01990           // If we don't have SSE2 available, convert to v4f32 so the generated
01991           // register is legal.
01992           if (!Subtarget->hasSSE2())
01993             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
01994         }
01995       }
01996     }
01997 
01998     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
01999     Flag = Chain.getValue(1);
02000     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
02001   }
02002 
02003   // The x86-64 ABIs require that for returning structs by value we copy
02004   // the sret argument into %rax/%eax (depending on ABI) for the return.
02005   // Win32 requires us to put the sret argument to %eax as well.
02006   // We saved the argument into a virtual register in the entry block,
02007   // so now we copy the value out and into %rax/%eax.
02008   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
02009       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
02010     MachineFunction &MF = DAG.getMachineFunction();
02011     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02012     unsigned Reg = FuncInfo->getSRetReturnReg();
02013     assert(Reg &&
02014            "SRetReturnReg should have been set in LowerFormalArguments().");
02015     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
02016 
02017     unsigned RetValReg
02018         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
02019           X86::RAX : X86::EAX;
02020     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
02021     Flag = Chain.getValue(1);
02022 
02023     // RAX/EAX now acts like a return value.
02024     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
02025   }
02026 
02027   RetOps[0] = Chain;  // Update chain.
02028 
02029   // Add the flag if we have it.
02030   if (Flag.getNode())
02031     RetOps.push_back(Flag);
02032 
02033   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
02034 }
02035 
02036 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
02037   if (N->getNumValues() != 1)
02038     return false;
02039   if (!N->hasNUsesOfValue(1, 0))
02040     return false;
02041 
02042   SDValue TCChain = Chain;
02043   SDNode *Copy = *N->use_begin();
02044   if (Copy->getOpcode() == ISD::CopyToReg) {
02045     // If the copy has a glue operand, we conservatively assume it isn't safe to
02046     // perform a tail call.
02047     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
02048       return false;
02049     TCChain = Copy->getOperand(0);
02050   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
02051     return false;
02052 
02053   bool HasRet = false;
02054   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
02055        UI != UE; ++UI) {
02056     if (UI->getOpcode() != X86ISD::RET_FLAG)
02057       return false;
02058     HasRet = true;
02059   }
02060 
02061   if (!HasRet)
02062     return false;
02063 
02064   Chain = TCChain;
02065   return true;
02066 }
02067 
02068 MVT
02069 X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
02070                                             ISD::NodeType ExtendKind) const {
02071   MVT ReturnMVT;
02072   // TODO: Is this also valid on 32-bit?
02073   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
02074     ReturnMVT = MVT::i8;
02075   else
02076     ReturnMVT = MVT::i32;
02077 
02078   MVT MinVT = getRegisterType(ReturnMVT);
02079   return VT.bitsLT(MinVT) ? MinVT : VT;
02080 }
02081 
02082 /// LowerCallResult - Lower the result values of a call into the
02083 /// appropriate copies out of appropriate physical registers.
02084 ///
02085 SDValue
02086 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
02087                                    CallingConv::ID CallConv, bool isVarArg,
02088                                    const SmallVectorImpl<ISD::InputArg> &Ins,
02089                                    SDLoc dl, SelectionDAG &DAG,
02090                                    SmallVectorImpl<SDValue> &InVals) const {
02091 
02092   // Assign locations to each value returned by this call.
02093   SmallVector<CCValAssign, 16> RVLocs;
02094   bool Is64Bit = Subtarget->is64Bit();
02095   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
02096                  DAG.getTarget(), RVLocs, *DAG.getContext());
02097   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02098 
02099   // Copy all of the result registers out of their specified physreg.
02100   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02101     CCValAssign &VA = RVLocs[i];
02102     EVT CopyVT = VA.getValVT();
02103 
02104     // If this is x86-64, and we disabled SSE, we can't return FP values
02105     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02106         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02107       report_fatal_error("SSE register return with SSE disabled");
02108     }
02109 
02110     SDValue Val;
02111 
02112     // If this is a call to a function that returns an fp value on the floating
02113     // point stack, we must guarantee the value is popped from the stack, so
02114     // a CopyFromReg is not good enough - the copy instruction may be eliminated
02115     // if the return value is not used. We use the FpPOP_RETVAL instruction
02116     // instead.
02117     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
02118       // If we prefer to use the value in xmm registers, copy it out as f80 and
02119       // use a truncate to move it from fp stack reg to xmm reg.
02120       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
02121       SDValue Ops[] = { Chain, InFlag };
02122       Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
02123                                          MVT::Other, MVT::Glue, Ops), 1);
02124       Val = Chain.getValue(0);
02125 
02126       // Round the f80 to the right size, which also moves it to the appropriate
02127       // xmm register.
02128       if (CopyVT != VA.getValVT())
02129         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02130                           // This truncation won't change the value.
02131                           DAG.getIntPtrConstant(1));
02132     } else {
02133       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02134                                  CopyVT, InFlag).getValue(1);
02135       Val = Chain.getValue(0);
02136     }
02137     InFlag = Chain.getValue(2);
02138     InVals.push_back(Val);
02139   }
02140 
02141   return Chain;
02142 }
02143 
02144 //===----------------------------------------------------------------------===//
02145 //                C & StdCall & Fast Calling Convention implementation
02146 //===----------------------------------------------------------------------===//
02147 //  StdCall calling convention seems to be standard for many Windows' API
02148 //  routines and around. It differs from C calling convention just a little:
02149 //  callee should clean up the stack, not caller. Symbols should be also
02150 //  decorated in some fancy way :) It doesn't support any vector arguments.
02151 //  For info on fast calling convention see Fast Calling Convention (tail call)
02152 //  implementation LowerX86_32FastCCCallTo.
02153 
02154 /// CallIsStructReturn - Determines whether a call uses struct return
02155 /// semantics.
02156 enum StructReturnType {
02157   NotStructReturn,
02158   RegStructReturn,
02159   StackStructReturn
02160 };
02161 static StructReturnType
02162 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02163   if (Outs.empty())
02164     return NotStructReturn;
02165 
02166   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02167   if (!Flags.isSRet())
02168     return NotStructReturn;
02169   if (Flags.isInReg())
02170     return RegStructReturn;
02171   return StackStructReturn;
02172 }
02173 
02174 /// ArgsAreStructReturn - Determines whether a function uses struct
02175 /// return semantics.
02176 static StructReturnType
02177 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02178   if (Ins.empty())
02179     return NotStructReturn;
02180 
02181   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02182   if (!Flags.isSRet())
02183     return NotStructReturn;
02184   if (Flags.isInReg())
02185     return RegStructReturn;
02186   return StackStructReturn;
02187 }
02188 
02189 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
02190 /// by "Src" to address "Dst" with size and alignment information specified by
02191 /// the specific parameter attribute. The copy will be passed as a byval
02192 /// function parameter.
02193 static SDValue
02194 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02195                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02196                           SDLoc dl) {
02197   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02198 
02199   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02200                        /*isVolatile*/false, /*AlwaysInline=*/true,
02201                        MachinePointerInfo(), MachinePointerInfo());
02202 }
02203 
02204 /// IsTailCallConvention - Return true if the calling convention is one that
02205 /// supports tail call optimization.
02206 static bool IsTailCallConvention(CallingConv::ID CC) {
02207   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02208           CC == CallingConv::HiPE);
02209 }
02210 
02211 /// \brief Return true if the calling convention is a C calling convention.
02212 static bool IsCCallConvention(CallingConv::ID CC) {
02213   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02214           CC == CallingConv::X86_64_SysV);
02215 }
02216 
02217 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02218   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02219     return false;
02220 
02221   CallSite CS(CI);
02222   CallingConv::ID CalleeCC = CS.getCallingConv();
02223   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02224     return false;
02225 
02226   return true;
02227 }
02228 
02229 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
02230 /// a tailcall target by changing its ABI.
02231 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02232                                    bool GuaranteedTailCallOpt) {
02233   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02234 }
02235 
02236 SDValue
02237 X86TargetLowering::LowerMemArgument(SDValue Chain,
02238                                     CallingConv::ID CallConv,
02239                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02240                                     SDLoc dl, SelectionDAG &DAG,
02241                                     const CCValAssign &VA,
02242                                     MachineFrameInfo *MFI,
02243                                     unsigned i) const {
02244   // Create the nodes corresponding to a load from this parameter slot.
02245   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02246   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
02247       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
02248   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02249   EVT ValVT;
02250 
02251   // If value is passed by pointer we have address passed instead of the value
02252   // itself.
02253   if (VA.getLocInfo() == CCValAssign::Indirect)
02254     ValVT = VA.getLocVT();
02255   else
02256     ValVT = VA.getValVT();
02257 
02258   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02259   // changed with more analysis.
02260   // In case of tail call optimization mark all arguments mutable. Since they
02261   // could be overwritten by lowering of arguments in case of a tail call.
02262   if (Flags.isByVal()) {
02263     unsigned Bytes = Flags.getByValSize();
02264     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02265     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02266     return DAG.getFrameIndex(FI, getPointerTy());
02267   } else {
02268     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02269                                     VA.getLocMemOffset(), isImmutable);
02270     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02271     return DAG.getLoad(ValVT, dl, Chain, FIN,
02272                        MachinePointerInfo::getFixedStack(FI),
02273                        false, false, false, 0);
02274   }
02275 }
02276 
02277 SDValue
02278 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02279                                         CallingConv::ID CallConv,
02280                                         bool isVarArg,
02281                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02282                                         SDLoc dl,
02283                                         SelectionDAG &DAG,
02284                                         SmallVectorImpl<SDValue> &InVals)
02285                                           const {
02286   MachineFunction &MF = DAG.getMachineFunction();
02287   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02288 
02289   const Function* Fn = MF.getFunction();
02290   if (Fn->hasExternalLinkage() &&
02291       Subtarget->isTargetCygMing() &&
02292       Fn->getName() == "main")
02293     FuncInfo->setForceFramePointer(true);
02294 
02295   MachineFrameInfo *MFI = MF.getFrameInfo();
02296   bool Is64Bit = Subtarget->is64Bit();
02297   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02298 
02299   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02300          "Var args not supported with calling convention fastcc, ghc or hipe");
02301 
02302   // Assign locations to all of the incoming arguments.
02303   SmallVector<CCValAssign, 16> ArgLocs;
02304   CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
02305                  ArgLocs, *DAG.getContext());
02306 
02307   // Allocate shadow area for Win64
02308   if (IsWin64)
02309     CCInfo.AllocateStack(32, 8);
02310 
02311   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02312 
02313   unsigned LastVal = ~0U;
02314   SDValue ArgValue;
02315   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02316     CCValAssign &VA = ArgLocs[i];
02317     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02318     // places.
02319     assert(VA.getValNo() != LastVal &&
02320            "Don't support value assigned to multiple locs yet");
02321     (void)LastVal;
02322     LastVal = VA.getValNo();
02323 
02324     if (VA.isRegLoc()) {
02325       EVT RegVT = VA.getLocVT();
02326       const TargetRegisterClass *RC;
02327       if (RegVT == MVT::i32)
02328         RC = &X86::GR32RegClass;
02329       else if (Is64Bit && RegVT == MVT::i64)
02330         RC = &X86::GR64RegClass;
02331       else if (RegVT == MVT::f32)
02332         RC = &X86::FR32RegClass;
02333       else if (RegVT == MVT::f64)
02334         RC = &X86::FR64RegClass;
02335       else if (RegVT.is512BitVector())
02336         RC = &X86::VR512RegClass;
02337       else if (RegVT.is256BitVector())
02338         RC = &X86::VR256RegClass;
02339       else if (RegVT.is128BitVector())
02340         RC = &X86::VR128RegClass;
02341       else if (RegVT == MVT::x86mmx)
02342         RC = &X86::VR64RegClass;
02343       else if (RegVT == MVT::i1)
02344         RC = &X86::VK1RegClass;
02345       else if (RegVT == MVT::v8i1)
02346         RC = &X86::VK8RegClass;
02347       else if (RegVT == MVT::v16i1)
02348         RC = &X86::VK16RegClass;
02349       else if (RegVT == MVT::v32i1)
02350         RC = &X86::VK32RegClass;
02351       else if (RegVT == MVT::v64i1)
02352         RC = &X86::VK64RegClass;
02353       else
02354         llvm_unreachable("Unknown argument type!");
02355 
02356       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02357       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02358 
02359       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02360       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02361       // right size.
02362       if (VA.getLocInfo() == CCValAssign::SExt)
02363         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02364                                DAG.getValueType(VA.getValVT()));
02365       else if (VA.getLocInfo() == CCValAssign::ZExt)
02366         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02367                                DAG.getValueType(VA.getValVT()));
02368       else if (VA.getLocInfo() == CCValAssign::BCvt)
02369         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02370 
02371       if (VA.isExtInLoc()) {
02372         // Handle MMX values passed in XMM regs.
02373         if (RegVT.isVector())
02374           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02375         else
02376           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02377       }
02378     } else {
02379       assert(VA.isMemLoc());
02380       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02381     }
02382 
02383     // If value is passed via pointer - do a load.
02384     if (VA.getLocInfo() == CCValAssign::Indirect)
02385       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02386                              MachinePointerInfo(), false, false, false, 0);
02387 
02388     InVals.push_back(ArgValue);
02389   }
02390 
02391   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
02392     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02393       // The x86-64 ABIs require that for returning structs by value we copy
02394       // the sret argument into %rax/%eax (depending on ABI) for the return.
02395       // Win32 requires us to put the sret argument to %eax as well.
02396       // Save the argument into a virtual register so that we can access it
02397       // from the return points.
02398       if (Ins[i].Flags.isSRet()) {
02399         unsigned Reg = FuncInfo->getSRetReturnReg();
02400         if (!Reg) {
02401           MVT PtrTy = getPointerTy();
02402           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02403           FuncInfo->setSRetReturnReg(Reg);
02404         }
02405         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
02406         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02407         break;
02408       }
02409     }
02410   }
02411 
02412   unsigned StackSize = CCInfo.getNextStackOffset();
02413   // Align stack specially for tail calls.
02414   if (FuncIsMadeTailCallSafe(CallConv,
02415                              MF.getTarget().Options.GuaranteedTailCallOpt))
02416     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02417 
02418   // If the function takes variable number of arguments, make a frame index for
02419   // the start of the first vararg value... for expansion of llvm.va_start.
02420   if (isVarArg) {
02421     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02422                     CallConv != CallingConv::X86_ThisCall)) {
02423       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
02424     }
02425     if (Is64Bit) {
02426       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
02427 
02428       // FIXME: We should really autogenerate these arrays
02429       static const MCPhysReg GPR64ArgRegsWin64[] = {
02430         X86::RCX, X86::RDX, X86::R8,  X86::R9
02431       };
02432       static const MCPhysReg GPR64ArgRegs64Bit[] = {
02433         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02434       };
02435       static const MCPhysReg XMMArgRegs64Bit[] = {
02436         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02437         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02438       };
02439       const MCPhysReg *GPR64ArgRegs;
02440       unsigned NumXMMRegs = 0;
02441 
02442       if (IsWin64) {
02443         // The XMM registers which might contain var arg parameters are shadowed
02444         // in their paired GPR.  So we only need to save the GPR to their home
02445         // slots.
02446         TotalNumIntRegs = 4;
02447         GPR64ArgRegs = GPR64ArgRegsWin64;
02448       } else {
02449         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
02450         GPR64ArgRegs = GPR64ArgRegs64Bit;
02451 
02452         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
02453                                                 TotalNumXMMRegs);
02454       }
02455       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
02456                                                        TotalNumIntRegs);
02457 
02458       bool NoImplicitFloatOps = Fn->getAttributes().
02459         hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
02460       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02461              "SSE register cannot be used when SSE is disabled!");
02462       assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
02463                NoImplicitFloatOps) &&
02464              "SSE register cannot be used when SSE is disabled!");
02465       if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02466           !Subtarget->hasSSE1())
02467         // Kernel mode asks for SSE to be disabled, so don't push them
02468         // on the stack.
02469         TotalNumXMMRegs = 0;
02470 
02471       if (IsWin64) {
02472         const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
02473         // Get to the caller-allocated home save location.  Add 8 to account
02474         // for the return address.
02475         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02476         FuncInfo->setRegSaveFrameIndex(
02477           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02478         // Fixup to set vararg frame on shadow area (4 x i64).
02479         if (NumIntRegs < 4)
02480           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02481       } else {
02482         // For X86-64, if there are vararg parameters that are passed via
02483         // registers, then we must store them to their spots on the stack so
02484         // they may be loaded by deferencing the result of va_next.
02485         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02486         FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
02487         FuncInfo->setRegSaveFrameIndex(
02488           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
02489                                false));
02490       }
02491 
02492       // Store the integer parameter registers.
02493       SmallVector<SDValue, 8> MemOps;
02494       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02495                                         getPointerTy());
02496       unsigned Offset = FuncInfo->getVarArgsGPOffset();
02497       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
02498         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02499                                   DAG.getIntPtrConstant(Offset));
02500         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
02501                                      &X86::GR64RegClass);
02502         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
02503         SDValue Store =
02504           DAG.getStore(Val.getValue(1), dl, Val, FIN,
02505                        MachinePointerInfo::getFixedStack(
02506                          FuncInfo->getRegSaveFrameIndex(), Offset),
02507                        false, false, 0);
02508         MemOps.push_back(Store);
02509         Offset += 8;
02510       }
02511 
02512       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
02513         // Now store the XMM (fp + vector) parameter registers.
02514         SmallVector<SDValue, 11> SaveXMMOps;
02515         SaveXMMOps.push_back(Chain);
02516 
02517         unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02518         SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
02519         SaveXMMOps.push_back(ALVal);
02520 
02521         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02522                                FuncInfo->getRegSaveFrameIndex()));
02523         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02524                                FuncInfo->getVarArgsFPOffset()));
02525 
02526         for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
02527           unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
02528                                        &X86::VR128RegClass);
02529           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
02530           SaveXMMOps.push_back(Val);
02531         }
02532         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02533                                      MVT::Other, SaveXMMOps));
02534       }
02535 
02536       if (!MemOps.empty())
02537         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
02538     }
02539   }
02540 
02541   // Some CCs need callee pop.
02542   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02543                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02544     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02545   } else {
02546     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02547     // If this is an sret function, the return should pop the hidden pointer.
02548     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02549         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02550         argsAreStructReturn(Ins) == StackStructReturn)
02551       FuncInfo->setBytesToPopOnReturn(4);
02552   }
02553 
02554   if (!Is64Bit) {
02555     // RegSaveFrameIndex is X86-64 only.
02556     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02557     if (CallConv == CallingConv::X86_FastCall ||
02558         CallConv == CallingConv::X86_ThisCall)
02559       // fastcc functions can't have varargs.
02560       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02561   }
02562 
02563   FuncInfo->setArgumentStackSize(StackSize);
02564 
02565   return Chain;
02566 }
02567 
02568 SDValue
02569 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02570                                     SDValue StackPtr, SDValue Arg,
02571                                     SDLoc dl, SelectionDAG &DAG,
02572                                     const CCValAssign &VA,
02573                                     ISD::ArgFlagsTy Flags) const {
02574   unsigned LocMemOffset = VA.getLocMemOffset();
02575   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02576   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02577   if (Flags.isByVal())
02578     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02579 
02580   return DAG.getStore(Chain, dl, Arg, PtrOff,
02581                       MachinePointerInfo::getStack(LocMemOffset),
02582                       false, false, 0);
02583 }
02584 
02585 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
02586 /// optimization is performed and it is required.
02587 SDValue
02588 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02589                                            SDValue &OutRetAddr, SDValue Chain,
02590                                            bool IsTailCall, bool Is64Bit,
02591                                            int FPDiff, SDLoc dl) const {
02592   // Adjust the Return address stack slot.
02593   EVT VT = getPointerTy();
02594   OutRetAddr = getReturnAddressFrameIndex(DAG);
02595 
02596   // Load the "old" Return address.
02597   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02598                            false, false, false, 0);
02599   return SDValue(OutRetAddr.getNode(), 1);
02600 }
02601 
02602 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
02603 /// optimization is performed and it is required (FPDiff!=0).
02604 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
02605                                         SDValue Chain, SDValue RetAddrFrIdx,
02606                                         EVT PtrVT, unsigned SlotSize,
02607                                         int FPDiff, SDLoc dl) {
02608   // Store the return address to the appropriate stack slot.
02609   if (!FPDiff) return Chain;
02610   // Calculate the new stack slot for the return address.
02611   int NewReturnAddrFI =
02612     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02613                                          false);
02614   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02615   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02616                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02617                        false, false, 0);
02618   return Chain;
02619 }
02620 
02621 SDValue
02622 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02623                              SmallVectorImpl<SDValue> &InVals) const {
02624   SelectionDAG &DAG                     = CLI.DAG;
02625   SDLoc &dl                             = CLI.DL;
02626   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02627   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02628   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02629   SDValue Chain                         = CLI.Chain;
02630   SDValue Callee                        = CLI.Callee;
02631   CallingConv::ID CallConv              = CLI.CallConv;
02632   bool &isTailCall                      = CLI.IsTailCall;
02633   bool isVarArg                         = CLI.IsVarArg;
02634 
02635   MachineFunction &MF = DAG.getMachineFunction();
02636   bool Is64Bit        = Subtarget->is64Bit();
02637   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02638   StructReturnType SR = callIsStructReturn(Outs);
02639   bool IsSibcall      = false;
02640 
02641   if (MF.getTarget().Options.DisableTailCalls)
02642     isTailCall = false;
02643 
02644   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
02645   if (IsMustTail) {
02646     // Force this to be a tail call.  The verifier rules are enough to ensure
02647     // that we can lower this successfully without moving the return address
02648     // around.
02649     isTailCall = true;
02650   } else if (isTailCall) {
02651     // Check if it's really possible to do a tail call.
02652     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02653                     isVarArg, SR != NotStructReturn,
02654                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02655                     Outs, OutVals, Ins, DAG);
02656 
02657     // Sibcalls are automatically detected tailcalls which do not require
02658     // ABI changes.
02659     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02660       IsSibcall = true;
02661 
02662     if (isTailCall)
02663       ++NumTailCalls;
02664   }
02665 
02666   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02667          "Var args not supported with calling convention fastcc, ghc or hipe");
02668 
02669   // Analyze operands of the call, assigning locations to each operand.
02670   SmallVector<CCValAssign, 16> ArgLocs;
02671   CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
02672                  ArgLocs, *DAG.getContext());
02673 
02674   // Allocate shadow area for Win64
02675   if (IsWin64)
02676     CCInfo.AllocateStack(32, 8);
02677 
02678   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02679 
02680   // Get a count of how many bytes are to be pushed on the stack.
02681   unsigned NumBytes = CCInfo.getNextStackOffset();
02682   if (IsSibcall)
02683     // This is a sibcall. The memory operands are available in caller's
02684     // own caller's stack.
02685     NumBytes = 0;
02686   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
02687            IsTailCallConvention(CallConv))
02688     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02689 
02690   int FPDiff = 0;
02691   if (isTailCall && !IsSibcall && !IsMustTail) {
02692     // Lower arguments at fp - stackoffset + fpdiff.
02693     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02694     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02695 
02696     FPDiff = NumBytesCallerPushed - NumBytes;
02697 
02698     // Set the delta of movement of the returnaddr stackslot.
02699     // But only set if delta is greater than previous delta.
02700     if (FPDiff < X86Info->getTCReturnAddrDelta())
02701       X86Info->setTCReturnAddrDelta(FPDiff);
02702   }
02703 
02704   unsigned NumBytesToPush = NumBytes;
02705   unsigned NumBytesToPop = NumBytes;
02706 
02707   // If we have an inalloca argument, all stack space has already been allocated
02708   // for us and be right at the top of the stack.  We don't support multiple
02709   // arguments passed in memory when using inalloca.
02710   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02711     NumBytesToPush = 0;
02712     if (!ArgLocs.back().isMemLoc())
02713       report_fatal_error("cannot use inalloca attribute on a register "
02714                          "parameter");
02715     if (ArgLocs.back().getLocMemOffset() != 0)
02716       report_fatal_error("any parameter with the inalloca attribute must be "
02717                          "the only memory argument");
02718   }
02719 
02720   if (!IsSibcall)
02721     Chain = DAG.getCALLSEQ_START(
02722         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02723 
02724   SDValue RetAddrFrIdx;
02725   // Load return address for tail calls.
02726   if (isTailCall && FPDiff)
02727     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02728                                     Is64Bit, FPDiff, dl);
02729 
02730   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02731   SmallVector<SDValue, 8> MemOpChains;
02732   SDValue StackPtr;
02733 
02734   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02735   // of tail call optimization arguments are handle later.
02736   const X86RegisterInfo *RegInfo =
02737     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
02738   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02739     // Skip inalloca arguments, they have already been written.
02740     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02741     if (Flags.isInAlloca())
02742       continue;
02743 
02744     CCValAssign &VA = ArgLocs[i];
02745     EVT RegVT = VA.getLocVT();
02746     SDValue Arg = OutVals[i];
02747     bool isByVal = Flags.isByVal();
02748 
02749     // Promote the value if needed.
02750     switch (VA.getLocInfo()) {
02751     default: llvm_unreachable("Unknown loc info!");
02752     case CCValAssign::Full: break;
02753     case CCValAssign::SExt:
02754       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02755       break;
02756     case CCValAssign::ZExt:
02757       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02758       break;
02759     case CCValAssign::AExt:
02760       if (RegVT.is128BitVector()) {
02761         // Special case: passing MMX values in XMM registers.
02762         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02763         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02764         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02765       } else
02766         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02767       break;
02768     case CCValAssign::BCvt:
02769       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02770       break;
02771     case CCValAssign::Indirect: {
02772       // Store the argument.
02773       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02774       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02775       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02776                            MachinePointerInfo::getFixedStack(FI),
02777                            false, false, 0);
02778       Arg = SpillSlot;
02779       break;
02780     }
02781     }
02782 
02783     if (VA.isRegLoc()) {
02784       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02785       if (isVarArg && IsWin64) {
02786         // Win64 ABI requires argument XMM reg to be copied to the corresponding
02787         // shadow reg if callee is a varargs function.
02788         unsigned ShadowReg = 0;
02789         switch (VA.getLocReg()) {
02790         case X86::XMM0: ShadowReg = X86::RCX; break;
02791         case X86::XMM1: ShadowReg = X86::RDX; break;
02792         case X86::XMM2: ShadowReg = X86::R8; break;
02793         case X86::XMM3: ShadowReg = X86::R9; break;
02794         }
02795         if (ShadowReg)
02796           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02797       }
02798     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02799       assert(VA.isMemLoc());
02800       if (!StackPtr.getNode())
02801         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02802                                       getPointerTy());
02803       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02804                                              dl, DAG, VA, Flags));
02805     }
02806   }
02807 
02808   if (!MemOpChains.empty())
02809     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
02810 
02811   if (Subtarget->isPICStyleGOT()) {
02812     // ELF / PIC requires GOT in the EBX register before function calls via PLT
02813     // GOT pointer.
02814     if (!isTailCall) {
02815       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02816                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02817     } else {
02818       // If we are tail calling and generating PIC/GOT style code load the
02819       // address of the callee into ECX. The value in ecx is used as target of
02820       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02821       // for tail calls on PIC/GOT architectures. Normally we would just put the
02822       // address of GOT into ebx and then call target@PLT. But for tail calls
02823       // ebx would be restored (since ebx is callee saved) before jumping to the
02824       // target@PLT.
02825 
02826       // Note: The actual moving to ECX is done further down.
02827       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02828       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02829           !G->getGlobal()->hasProtectedVisibility())
02830         Callee = LowerGlobalAddress(Callee, DAG);
02831       else if (isa<ExternalSymbolSDNode>(Callee))
02832         Callee = LowerExternalSymbol(Callee, DAG);
02833     }
02834   }
02835 
02836   if (Is64Bit && isVarArg && !IsWin64) {
02837     // From AMD64 ABI document:
02838     // For calls that may call functions that use varargs or stdargs
02839     // (prototype-less calls or calls to functions containing ellipsis (...) in
02840     // the declaration) %al is used as hidden argument to specify the number
02841     // of SSE registers used. The contents of %al do not need to match exactly
02842     // the number of registers, but must be an ubound on the number of SSE
02843     // registers used and is in the range 0 - 8 inclusive.
02844 
02845     // Count the number of XMM registers allocated.
02846     static const MCPhysReg XMMArgRegs[] = {
02847       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02848       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02849     };
02850     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
02851     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02852            && "SSE registers cannot be used when SSE is disabled");
02853 
02854     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02855                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
02856   }
02857 
02858   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
02859   // don't need this because the eligibility check rejects calls that require
02860   // shuffling arguments passed in memory.
02861   if (!IsSibcall && isTailCall) {
02862     // Force all the incoming stack arguments to be loaded from the stack
02863     // before any new outgoing arguments are stored to the stack, because the
02864     // outgoing stack slots may alias the incoming argument stack slots, and
02865     // the alias isn't otherwise explicit. This is slightly more conservative
02866     // than necessary, because it means that each store effectively depends
02867     // on every argument instead of just those arguments it would clobber.
02868     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02869 
02870     SmallVector<SDValue, 8> MemOpChains2;
02871     SDValue FIN;
02872     int FI = 0;
02873     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02874       CCValAssign &VA = ArgLocs[i];
02875       if (VA.isRegLoc())
02876         continue;
02877       assert(VA.isMemLoc());
02878       SDValue Arg = OutVals[i];
02879       ISD::ArgFlagsTy Flags = Outs[i].Flags;
02880       // Skip inalloca arguments.  They don't require any work.
02881       if (Flags.isInAlloca())
02882         continue;
02883       // Create frame index.
02884       int32_t Offset = VA.getLocMemOffset()+FPDiff;
02885       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
02886       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
02887       FIN = DAG.getFrameIndex(FI, getPointerTy());
02888 
02889       if (Flags.isByVal()) {
02890         // Copy relative to framepointer.
02891         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
02892         if (!StackPtr.getNode())
02893           StackPtr = DAG.getCopyFromReg(Chain, dl,
02894                                         RegInfo->getStackRegister(),
02895                                         getPointerTy());
02896         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
02897 
02898         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
02899                                                          ArgChain,
02900                                                          Flags, DAG, dl));
02901       } else {
02902         // Store relative to framepointer.
02903         MemOpChains2.push_back(
02904           DAG.getStore(ArgChain, dl, Arg, FIN,
02905                        MachinePointerInfo::getFixedStack(FI),
02906                        false, false, 0));
02907       }
02908     }
02909 
02910     if (!MemOpChains2.empty())
02911       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
02912 
02913     // Store the return address to the appropriate stack slot.
02914     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
02915                                      getPointerTy(), RegInfo->getSlotSize(),
02916                                      FPDiff, dl);
02917   }
02918 
02919   // Build a sequence of copy-to-reg nodes chained together with token chain
02920   // and flag operands which copy the outgoing args into registers.
02921   SDValue InFlag;
02922   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
02923     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
02924                              RegsToPass[i].second, InFlag);
02925     InFlag = Chain.getValue(1);
02926   }
02927 
02928   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
02929     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
02930     // In the 64-bit large code model, we have to make all calls
02931     // through a register, since the call instruction's 32-bit
02932     // pc-relative offset may not be large enough to hold the whole
02933     // address.
02934   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
02935     // If the callee is a GlobalAddress node (quite common, every direct call
02936     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
02937     // it.
02938 
02939     // We should use extra load for direct calls to dllimported functions in
02940     // non-JIT mode.
02941     const GlobalValue *GV = G->getGlobal();
02942     if (!GV->hasDLLImportStorageClass()) {
02943       unsigned char OpFlags = 0;
02944       bool ExtraLoad = false;
02945       unsigned WrapperKind = ISD::DELETED_NODE;
02946 
02947       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
02948       // external symbols most go through the PLT in PIC mode.  If the symbol
02949       // has hidden or protected visibility, or if it is static or local, then
02950       // we don't need to use the PLT - we can directly call it.
02951       if (Subtarget->isTargetELF() &&
02952           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
02953           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
02954         OpFlags = X86II::MO_PLT;
02955       } else if (Subtarget->isPICStyleStubAny() &&
02956                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
02957                  (!Subtarget->getTargetTriple().isMacOSX() ||
02958                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
02959         // PC-relative references to external symbols should go through $stub,
02960         // unless we're building with the leopard linker or later, which
02961         // automatically synthesizes these stubs.
02962         OpFlags = X86II::MO_DARWIN_STUB;
02963       } else if (Subtarget->isPICStyleRIPRel() &&
02964                  isa<Function>(GV) &&
02965                  cast<Function>(GV)->getAttributes().
02966                    hasAttribute(AttributeSet::FunctionIndex,
02967                                 Attribute::NonLazyBind)) {
02968         // If the function is marked as non-lazy, generate an indirect call
02969         // which loads from the GOT directly. This avoids runtime overhead
02970         // at the cost of eager binding (and one extra byte of encoding).
02971         OpFlags = X86II::MO_GOTPCREL;
02972         WrapperKind = X86ISD::WrapperRIP;
02973         ExtraLoad = true;
02974       }
02975 
02976       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
02977                                           G->getOffset(), OpFlags);
02978 
02979       // Add a wrapper if needed.
02980       if (WrapperKind != ISD::DELETED_NODE)
02981         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
02982       // Add extra indirection if needed.
02983       if (ExtraLoad)
02984         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
02985                              MachinePointerInfo::getGOT(),
02986                              false, false, false, 0);
02987     }
02988   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
02989     unsigned char OpFlags = 0;
02990 
02991     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
02992     // external symbols should go through the PLT.
02993     if (Subtarget->isTargetELF() &&
02994         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
02995       OpFlags = X86II::MO_PLT;
02996     } else if (Subtarget->isPICStyleStubAny() &&
02997                (!Subtarget->getTargetTriple().isMacOSX() ||
02998                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
02999       // PC-relative references to external symbols should go through $stub,
03000       // unless we're building with the leopard linker or later, which
03001       // automatically synthesizes these stubs.
03002       OpFlags = X86II::MO_DARWIN_STUB;
03003     }
03004 
03005     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
03006                                          OpFlags);
03007   }
03008 
03009   // Returns a chain & a flag for retval copy to use.
03010   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
03011   SmallVector<SDValue, 8> Ops;
03012 
03013   if (!IsSibcall && isTailCall) {
03014     Chain = DAG.getCALLSEQ_END(Chain,
03015                                DAG.getIntPtrConstant(NumBytesToPop, true),
03016                                DAG.getIntPtrConstant(0, true), InFlag, dl);
03017     InFlag = Chain.getValue(1);
03018   }
03019 
03020   Ops.push_back(Chain);
03021   Ops.push_back(Callee);
03022 
03023   if (isTailCall)
03024     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
03025 
03026   // Add argument registers to the end of the list so that they are known live
03027   // into the call.
03028   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
03029     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
03030                                   RegsToPass[i].second.getValueType()));
03031 
03032   // Add a register mask operand representing the call-preserved registers.
03033   const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
03034   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
03035   assert(Mask && "Missing call preserved mask for calling convention");
03036   Ops.push_back(DAG.getRegisterMask(Mask));
03037 
03038   if (InFlag.getNode())
03039     Ops.push_back(InFlag);
03040 
03041   if (isTailCall) {
03042     // We used to do:
03043     //// If this is the first return lowered for this function, add the regs
03044     //// to the liveout set for the function.
03045     // This isn't right, although it's probably harmless on x86; liveouts
03046     // should be computed from returns not tail calls.  Consider a void
03047     // function making a tail call to a function returning int.
03048     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03049   }
03050 
03051   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03052   InFlag = Chain.getValue(1);
03053 
03054   // Create the CALLSEQ_END node.
03055   unsigned NumBytesForCalleeToPop;
03056   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03057                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03058     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03059   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03060            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03061            SR == StackStructReturn)
03062     // If this is a call to a struct-return function, the callee
03063     // pops the hidden struct pointer, so we have to push it back.
03064     // This is common for Darwin/X86, Linux & Mingw32 targets.
03065     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03066     NumBytesForCalleeToPop = 4;
03067   else
03068     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03069 
03070   // Returns a flag for retval copy to use.
03071   if (!IsSibcall) {
03072     Chain = DAG.getCALLSEQ_END(Chain,
03073                                DAG.getIntPtrConstant(NumBytesToPop, true),
03074                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03075                                                      true),
03076                                InFlag, dl);
03077     InFlag = Chain.getValue(1);
03078   }
03079 
03080   // Handle result values, copying them out of physregs into vregs that we
03081   // return.
03082   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03083                          Ins, dl, DAG, InVals);
03084 }
03085 
03086 //===----------------------------------------------------------------------===//
03087 //                Fast Calling Convention (tail call) implementation
03088 //===----------------------------------------------------------------------===//
03089 
03090 //  Like std call, callee cleans arguments, convention except that ECX is
03091 //  reserved for storing the tail called function address. Only 2 registers are
03092 //  free for argument passing (inreg). Tail call optimization is performed
03093 //  provided:
03094 //                * tailcallopt is enabled
03095 //                * caller/callee are fastcc
03096 //  On X86_64 architecture with GOT-style position independent code only local
03097 //  (within module) calls are supported at the moment.
03098 //  To keep the stack aligned according to platform abi the function
03099 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
03100 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
03101 //  If a tail called function callee has more arguments than the caller the
03102 //  caller needs to make sure that there is room to move the RETADDR to. This is
03103 //  achieved by reserving an area the size of the argument delta right after the
03104 //  original RETADDR, but before the saved framepointer or the spilled registers
03105 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
03106 //  stack layout:
03107 //    arg1
03108 //    arg2
03109 //    RETADDR
03110 //    [ new RETADDR
03111 //      move area ]
03112 //    (possible EBP)
03113 //    ESI
03114 //    EDI
03115 //    local1 ..
03116 
03117 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
03118 /// for a 16 byte align requirement.
03119 unsigned
03120 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03121                                                SelectionDAG& DAG) const {
03122   MachineFunction &MF = DAG.getMachineFunction();
03123   const TargetMachine &TM = MF.getTarget();
03124   const X86RegisterInfo *RegInfo =
03125     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
03126   const TargetFrameLowering &TFI = *TM.getFrameLowering();
03127   unsigned StackAlignment = TFI.getStackAlignment();
03128   uint64_t AlignMask = StackAlignment - 1;
03129   int64_t Offset = StackSize;
03130   unsigned SlotSize = RegInfo->getSlotSize();
03131   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03132     // Number smaller than 12 so just add the difference.
03133     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03134   } else {
03135     // Mask out lower bits, add stackalignment once plus the 12 bytes.
03136     Offset = ((~AlignMask) & Offset) + StackAlignment +
03137       (StackAlignment-SlotSize);
03138   }
03139   return Offset;
03140 }
03141 
03142 /// MatchingStackOffset - Return true if the given stack call argument is
03143 /// already available in the same position (relatively) of the caller's
03144 /// incoming argument stack.
03145 static
03146 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03147                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03148                          const X86InstrInfo *TII) {
03149   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03150   int FI = INT_MAX;
03151   if (Arg.getOpcode() == ISD::CopyFromReg) {
03152     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03153     if (!TargetRegisterInfo::isVirtualRegister(VR))
03154       return false;
03155     MachineInstr *Def = MRI->getVRegDef(VR);
03156     if (!Def)
03157       return false;
03158     if (!Flags.isByVal()) {
03159       if (!TII->isLoadFromStackSlot(Def, FI))
03160         return false;
03161     } else {
03162       unsigned Opcode = Def->getOpcode();
03163       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
03164           Def->getOperand(1).isFI()) {
03165         FI = Def->getOperand(1).getIndex();
03166         Bytes = Flags.getByValSize();
03167       } else
03168         return false;
03169     }
03170   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03171     if (Flags.isByVal())
03172       // ByVal argument is passed in as a pointer but it's now being
03173       // dereferenced. e.g.
03174       // define @foo(%struct.X* %A) {
03175       //   tail call @bar(%struct.X* byval %A)
03176       // }
03177       return false;
03178     SDValue Ptr = Ld->getBasePtr();
03179     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03180     if (!FINode)
03181       return false;
03182     FI = FINode->getIndex();
03183   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03184     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03185     FI = FINode->getIndex();
03186     Bytes = Flags.getByValSize();
03187   } else
03188     return false;
03189 
03190   assert(FI != INT_MAX);
03191   if (!MFI->isFixedObjectIndex(FI))
03192     return false;
03193   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03194 }
03195 
03196 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03197 /// for tail call optimization. Targets which want to do tail call
03198 /// optimization should implement this function.
03199 bool
03200 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03201                                                      CallingConv::ID CalleeCC,
03202                                                      bool isVarArg,
03203                                                      bool isCalleeStructRet,
03204                                                      bool isCallerStructRet,
03205                                                      Type *RetTy,
03206                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03207                                     const SmallVectorImpl<SDValue> &OutVals,
03208                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03209                                                      SelectionDAG &DAG) const {
03210   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03211     return false;
03212 
03213   // If -tailcallopt is specified, make fastcc functions tail-callable.
03214   const MachineFunction &MF = DAG.getMachineFunction();
03215   const Function *CallerF = MF.getFunction();
03216 
03217   // If the function return type is x86_fp80 and the callee return type is not,
03218   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03219   // perform a tailcall optimization here.
03220   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03221     return false;
03222 
03223   CallingConv::ID CallerCC = CallerF->getCallingConv();
03224   bool CCMatch = CallerCC == CalleeCC;
03225   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03226   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03227 
03228   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
03229     if (IsTailCallConvention(CalleeCC) && CCMatch)
03230       return true;
03231     return false;
03232   }
03233 
03234   // Look for obvious safe cases to perform tail call optimization that do not
03235   // require ABI changes. This is what gcc calls sibcall.
03236 
03237   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03238   // emit a special epilogue.
03239   const X86RegisterInfo *RegInfo =
03240     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
03241   if (RegInfo->needsStackRealignment(MF))
03242     return false;
03243 
03244   // Also avoid sibcall optimization if either caller or callee uses struct
03245   // return semantics.
03246   if (isCalleeStructRet || isCallerStructRet)
03247     return false;
03248 
03249   // An stdcall/thiscall caller is expected to clean up its arguments; the
03250   // callee isn't going to do that.
03251   // FIXME: this is more restrictive than needed. We could produce a tailcall
03252   // when the stack adjustment matches. For example, with a thiscall that takes
03253   // only one argument.
03254   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03255                    CallerCC == CallingConv::X86_ThisCall))
03256     return false;
03257 
03258   // Do not sibcall optimize vararg calls unless all arguments are passed via
03259   // registers.
03260   if (isVarArg && !Outs.empty()) {
03261 
03262     // Optimizing for varargs on Win64 is unlikely to be safe without
03263     // additional testing.
03264     if (IsCalleeWin64 || IsCallerWin64)
03265       return false;
03266 
03267     SmallVector<CCValAssign, 16> ArgLocs;
03268     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
03269                    DAG.getTarget(), ArgLocs, *DAG.getContext());
03270 
03271     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03272     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03273       if (!ArgLocs[i].isRegLoc())
03274         return false;
03275   }
03276 
03277   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03278   // stack.  Therefore, if it's not used by the call it is not safe to optimize
03279   // this into a sibcall.
03280   bool Unused = false;
03281   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03282     if (!Ins[i].Used) {
03283       Unused = true;
03284       break;
03285     }
03286   }
03287   if (Unused) {
03288     SmallVector<CCValAssign, 16> RVLocs;
03289     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
03290                    DAG.getTarget(), RVLocs, *DAG.getContext());
03291     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03292     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03293       CCValAssign &VA = RVLocs[i];
03294       if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
03295         return false;
03296     }
03297   }
03298 
03299   // If the calling conventions do not match, then we'd better make sure the
03300   // results are returned in the same way as what the caller expects.
03301   if (!CCMatch) {
03302     SmallVector<CCValAssign, 16> RVLocs1;
03303     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
03304                     DAG.getTarget(), RVLocs1, *DAG.getContext());
03305     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03306 
03307     SmallVector<CCValAssign, 16> RVLocs2;
03308     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
03309                     DAG.getTarget(), RVLocs2, *DAG.getContext());
03310     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03311 
03312     if (RVLocs1.size() != RVLocs2.size())
03313       return false;
03314     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03315       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03316         return false;
03317       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03318         return false;
03319       if (RVLocs1[i].isRegLoc()) {
03320         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03321           return false;
03322       } else {
03323         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03324           return false;
03325       }
03326     }
03327   }
03328 
03329   // If the callee takes no arguments then go on to check the results of the
03330   // call.
03331   if (!Outs.empty()) {
03332     // Check if stack adjustment is needed. For now, do not do this if any
03333     // argument is passed on the stack.
03334     SmallVector<CCValAssign, 16> ArgLocs;
03335     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
03336                    DAG.getTarget(), ArgLocs, *DAG.getContext());
03337 
03338     // Allocate shadow area for Win64
03339     if (IsCalleeWin64)
03340       CCInfo.AllocateStack(32, 8);
03341 
03342     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03343     if (CCInfo.getNextStackOffset()) {
03344       MachineFunction &MF = DAG.getMachineFunction();
03345       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03346         return false;
03347 
03348       // Check if the arguments are already laid out in the right way as
03349       // the caller's fixed stack objects.
03350       MachineFrameInfo *MFI = MF.getFrameInfo();
03351       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03352       const X86InstrInfo *TII =
03353           static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo());
03354       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03355         CCValAssign &VA = ArgLocs[i];
03356         SDValue Arg = OutVals[i];
03357         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03358         if (VA.getLocInfo() == CCValAssign::Indirect)
03359           return false;
03360         if (!VA.isRegLoc()) {
03361           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03362                                    MFI, MRI, TII))
03363             return false;
03364         }
03365       }
03366     }
03367 
03368     // If the tailcall address may be in a register, then make sure it's
03369     // possible to register allocate for it. In 32-bit, the call address can
03370     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03371     // callee-saved registers are restored. These happen to be the same
03372     // registers used to pass 'inreg' arguments so watch out for those.
03373     if (!Subtarget->is64Bit() &&
03374         ((!isa<GlobalAddressSDNode>(Callee) &&
03375           !isa<ExternalSymbolSDNode>(Callee)) ||
03376          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
03377       unsigned NumInRegs = 0;
03378       // In PIC we need an extra register to formulate the address computation
03379       // for the callee.
03380       unsigned MaxInRegs =
03381   (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03382 
03383       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03384         CCValAssign &VA = ArgLocs[i];
03385         if (!VA.isRegLoc())
03386           continue;
03387         unsigned Reg = VA.getLocReg();
03388         switch (Reg) {
03389         default: break;
03390         case X86::EAX: case X86::EDX: case X86::ECX:
03391           if (++NumInRegs == MaxInRegs)
03392             return false;
03393           break;
03394         }
03395       }
03396     }
03397   }
03398 
03399   return true;
03400 }
03401 
03402 FastISel *
03403 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03404                                   const TargetLibraryInfo *libInfo) const {
03405   return X86::createFastISel(funcInfo, libInfo);
03406 }
03407 
03408 //===----------------------------------------------------------------------===//
03409 //                           Other Lowering Hooks
03410 //===----------------------------------------------------------------------===//
03411 
03412 static bool MayFoldLoad(SDValue Op) {
03413   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03414 }
03415 
03416 static bool MayFoldIntoStore(SDValue Op) {
03417   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03418 }
03419 
03420 static bool isTargetShuffle(unsigned Opcode) {
03421   switch(Opcode) {
03422   default: return false;
03423   case X86ISD::PSHUFD:
03424   case X86ISD::PSHUFHW:
03425   case X86ISD::PSHUFLW:
03426   case X86ISD::SHUFP:
03427   case X86ISD::PALIGNR:
03428   case X86ISD::MOVLHPS:
03429   case X86ISD::MOVLHPD:
03430   case X86ISD::MOVHLPS:
03431   case X86ISD::MOVLPS:
03432   case X86ISD::MOVLPD:
03433   case X86ISD::MOVSHDUP:
03434   case X86ISD::MOVSLDUP:
03435   case X86ISD::MOVDDUP:
03436   case X86ISD::MOVSS:
03437   case X86ISD::MOVSD:
03438   case X86ISD::UNPCKL:
03439   case X86ISD::UNPCKH:
03440   case X86ISD::VPERMILP:
03441   case X86ISD::VPERM2X128:
03442   case X86ISD::VPERMI:
03443     return true;
03444   }
03445 }
03446 
03447 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03448                                     SDValue V1, SelectionDAG &DAG) {
03449   switch(Opc) {
03450   default: llvm_unreachable("Unknown x86 shuffle node");
03451   case X86ISD::MOVSHDUP:
03452   case X86ISD::MOVSLDUP:
03453   case X86ISD::MOVDDUP:
03454     return DAG.getNode(Opc, dl, VT, V1);
03455   }
03456 }
03457 
03458 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03459                                     SDValue V1, unsigned TargetMask,
03460                                     SelectionDAG &DAG) {
03461   switch(Opc) {
03462   default: llvm_unreachable("Unknown x86 shuffle node");
03463   case X86ISD::PSHUFD:
03464   case X86ISD::PSHUFHW:
03465   case X86ISD::PSHUFLW:
03466   case X86ISD::VPERMILP:
03467   case X86ISD::VPERMI:
03468     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03469   }
03470 }
03471 
03472 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03473                                     SDValue V1, SDValue V2, unsigned TargetMask,
03474                                     SelectionDAG &DAG) {
03475   switch(Opc) {
03476   default: llvm_unreachable("Unknown x86 shuffle node");
03477   case X86ISD::PALIGNR:
03478   case X86ISD::SHUFP:
03479   case X86ISD::VPERM2X128:
03480     return DAG.getNode(Opc, dl, VT, V1, V2,
03481                        DAG.getConstant(TargetMask, MVT::i8));
03482   }
03483 }
03484 
03485 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03486                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03487   switch(Opc) {
03488   default: llvm_unreachable("Unknown x86 shuffle node");
03489   case X86ISD::MOVLHPS:
03490   case X86ISD::MOVLHPD:
03491   case X86ISD::MOVHLPS:
03492   case X86ISD::MOVLPS:
03493   case X86ISD::MOVLPD:
03494   case X86ISD::MOVSS:
03495   case X86ISD::MOVSD:
03496   case X86ISD::UNPCKL:
03497   case X86ISD::UNPCKH:
03498     return DAG.getNode(Opc, dl, VT, V1, V2);
03499   }
03500 }
03501 
03502 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03503   MachineFunction &MF = DAG.getMachineFunction();
03504   const X86RegisterInfo *RegInfo =
03505     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
03506   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03507   int ReturnAddrIndex = FuncInfo->getRAIndex();
03508 
03509   if (ReturnAddrIndex == 0) {
03510     // Set up a frame object for the return address.
03511     unsigned SlotSize = RegInfo->getSlotSize();
03512     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03513                                                            -(int64_t)SlotSize,
03514                                                            false);
03515     FuncInfo->setRAIndex(ReturnAddrIndex);
03516   }
03517 
03518   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03519 }
03520 
03521 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03522                                        bool hasSymbolicDisplacement) {
03523   // Offset should fit into 32 bit immediate field.
03524   if (!isInt<32>(Offset))
03525     return false;
03526 
03527   // If we don't have a symbolic displacement - we don't have any extra
03528   // restrictions.
03529   if (!hasSymbolicDisplacement)
03530     return true;
03531 
03532   // FIXME: Some tweaks might be needed for medium code model.
03533   if (M != CodeModel::Small && M != CodeModel::Kernel)
03534     return false;
03535 
03536   // For small code model we assume that latest object is 16MB before end of 31
03537   // bits boundary. We may also accept pretty large negative constants knowing
03538   // that all objects are in the positive half of address space.
03539   if (M == CodeModel::Small && Offset < 16*1024*1024)
03540     return true;
03541 
03542   // For kernel code model we know that all object resist in the negative half
03543   // of 32bits address space. We may not accept negative offsets, since they may
03544   // be just off and we may accept pretty large positive ones.
03545   if (M == CodeModel::Kernel && Offset > 0)
03546     return true;
03547 
03548   return false;
03549 }
03550 
03551 /// isCalleePop - Determines whether the callee is required to pop its
03552 /// own arguments. Callee pop is necessary to support tail calls.
03553 bool X86::isCalleePop(CallingConv::ID CallingConv,
03554                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03555   if (IsVarArg)
03556     return false;
03557 
03558   switch (CallingConv) {
03559   default:
03560     return false;
03561   case CallingConv::X86_StdCall:
03562     return !is64Bit;
03563   case CallingConv::X86_FastCall:
03564     return !is64Bit;
03565   case CallingConv::X86_ThisCall:
03566     return !is64Bit;
03567   case CallingConv::Fast:
03568     return TailCallOpt;
03569   case CallingConv::GHC:
03570     return TailCallOpt;
03571   case CallingConv::HiPE:
03572     return TailCallOpt;
03573   }
03574 }
03575 
03576 /// \brief Return true if the condition is an unsigned comparison operation.
03577 static bool isX86CCUnsigned(unsigned X86CC) {
03578   switch (X86CC) {
03579   default: llvm_unreachable("Invalid integer condition!");
03580   case X86::COND_E:     return true;
03581   case X86::COND_G:     return false;
03582   case X86::COND_GE:    return false;
03583   case X86::COND_L:     return false;
03584   case X86::COND_LE:    return false;
03585   case X86::COND_NE:    return true;
03586   case X86::COND_B:     return true;
03587   case X86::COND_A:     return true;
03588   case X86::COND_BE:    return true;
03589   case X86::COND_AE:    return true;
03590   }
03591   llvm_unreachable("covered switch fell through?!");
03592 }
03593 
03594 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
03595 /// specific condition code, returning the condition code and the LHS/RHS of the
03596 /// comparison to make.
03597 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03598                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03599   if (!isFP) {
03600     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03601       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03602         // X > -1   -> X == 0, jump !sign.
03603         RHS = DAG.getConstant(0, RHS.getValueType());
03604         return X86::COND_NS;
03605       }
03606       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03607         // X < 0   -> X == 0, jump on sign.
03608         return X86::COND_S;
03609       }
03610       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03611         // X < 1   -> X <= 0
03612         RHS = DAG.getConstant(0, RHS.getValueType());
03613         return X86::COND_LE;
03614       }
03615     }
03616 
03617     switch (SetCCOpcode) {
03618     default: llvm_unreachable("Invalid integer condition!");
03619     case ISD::SETEQ:  return X86::COND_E;
03620     case ISD::SETGT:  return X86::COND_G;
03621     case ISD::SETGE:  return X86::COND_GE;
03622     case ISD::SETLT:  return X86::COND_L;
03623     case ISD::SETLE:  return X86::COND_LE;
03624     case ISD::SETNE:  return X86::COND_NE;
03625     case ISD::SETULT: return X86::COND_B;
03626     case ISD::SETUGT: return X86::COND_A;
03627     case ISD::SETULE: return X86::COND_BE;
03628     case ISD::SETUGE: return X86::COND_AE;
03629     }
03630   }
03631 
03632   // First determine if it is required or is profitable to flip the operands.
03633 
03634   // If LHS is a foldable load, but RHS is not, flip the condition.
03635   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03636       !ISD::isNON_EXTLoad(RHS.getNode())) {
03637     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03638     std::swap(LHS, RHS);
03639   }
03640 
03641   switch (SetCCOpcode) {
03642   default: break;
03643   case ISD::SETOLT:
03644   case ISD::SETOLE:
03645   case ISD::SETUGT:
03646   case ISD::SETUGE:
03647     std::swap(LHS, RHS);
03648     break;
03649   }
03650 
03651   // On a floating point condition, the flags are set as follows:
03652   // ZF  PF  CF   op
03653   //  0 | 0 | 0 | X > Y
03654   //  0 | 0 | 1 | X < Y
03655   //  1 | 0 | 0 | X == Y
03656   //  1 | 1 | 1 | unordered
03657   switch (SetCCOpcode) {
03658   default: llvm_unreachable("Condcode should be pre-legalized away");
03659   case ISD::SETUEQ:
03660   case ISD::SETEQ:   return X86::COND_E;
03661   case ISD::SETOLT:              // flipped
03662   case ISD::SETOGT:
03663   case ISD::SETGT:   return X86::COND_A;
03664   case ISD::SETOLE:              // flipped
03665   case ISD::SETOGE:
03666   case ISD::SETGE:   return X86::COND_AE;
03667   case ISD::SETUGT:              // flipped
03668   case ISD::SETULT:
03669   case ISD::SETLT:   return X86::COND_B;
03670   case ISD::SETUGE:              // flipped
03671   case ISD::SETULE:
03672   case ISD::SETLE:   return X86::COND_BE;
03673   case ISD::SETONE:
03674   case ISD::SETNE:   return X86::COND_NE;
03675   case ISD::SETUO:   return X86::COND_P;
03676   case ISD::SETO:    return X86::COND_NP;
03677   case ISD::SETOEQ:
03678   case ISD::SETUNE:  return X86::COND_INVALID;
03679   }
03680 }
03681 
03682 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
03683 /// code. Current x86 isa includes the following FP cmov instructions:
03684 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03685 static bool hasFPCMov(unsigned X86CC) {
03686   switch (X86CC) {
03687   default:
03688     return false;
03689   case X86::COND_B:
03690   case X86::COND_BE:
03691   case X86::COND_E:
03692   case X86::COND_P:
03693   case X86::COND_A:
03694   case X86::COND_AE:
03695   case X86::COND_NE:
03696   case X86::COND_NP:
03697     return true;
03698   }
03699 }
03700 
03701 /// isFPImmLegal - Returns true if the target can instruction select the
03702 /// specified FP immediate natively. If false, the legalizer will
03703 /// materialize the FP immediate as a load from a constant pool.
03704 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03705   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03706     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03707       return true;
03708   }
03709   return false;
03710 }
03711 
03712 /// \brief Returns true if it is beneficial to convert a load of a constant
03713 /// to just the constant itself.
03714 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03715                                                           Type *Ty) const {
03716   assert(Ty->isIntegerTy());
03717 
03718   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03719   if (BitSize == 0 || BitSize > 64)
03720     return false;
03721   return true;
03722 }
03723 
03724 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03725 /// the specified range (L, H].
03726 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03727   return (Val < 0) || (Val >= Low && Val < Hi);
03728 }
03729 
03730 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03731 /// specified value.
03732 static bool isUndefOrEqual(int Val, int CmpVal) {
03733   return (Val < 0 || Val == CmpVal);
03734 }
03735 
03736 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03737 /// from position Pos and ending in Pos+Size, falls within the specified
03738 /// sequential range (L, L+Pos]. or is undef.
03739 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03740                                        unsigned Pos, unsigned Size, int Low) {
03741   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03742     if (!isUndefOrEqual(Mask[i], Low))
03743       return false;
03744   return true;
03745 }
03746 
03747 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
03748 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
03749 /// the second operand.
03750 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
03751   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
03752     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
03753   if (VT == MVT::v2f64 || VT == MVT::v2i64)
03754     return (Mask[0] < 2 && Mask[1] < 2);
03755   return false;
03756 }
03757 
03758 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
03759 /// is suitable for input to PSHUFHW.
03760 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03761   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03762     return false;
03763 
03764   // Lower quadword copied in order or undef.
03765   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
03766     return false;
03767 
03768   // Upper quadword shuffled.
03769   for (unsigned i = 4; i != 8; ++i)
03770     if (!isUndefOrInRange(Mask[i], 4, 8))
03771       return false;
03772 
03773   if (VT == MVT::v16i16) {
03774     // Lower quadword copied in order or undef.
03775     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
03776       return false;
03777 
03778     // Upper quadword shuffled.
03779     for (unsigned i = 12; i != 16; ++i)
03780       if (!isUndefOrInRange(Mask[i], 12, 16))
03781         return false;
03782   }
03783 
03784   return true;
03785 }
03786 
03787 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
03788 /// is suitable for input to PSHUFLW.
03789 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03790   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03791     return false;
03792 
03793   // Upper quadword copied in order.
03794   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
03795     return false;
03796 
03797   // Lower quadword shuffled.
03798   for (unsigned i = 0; i != 4; ++i)
03799     if (!isUndefOrInRange(Mask[i], 0, 4))
03800       return false;
03801 
03802   if (VT == MVT::v16i16) {
03803     // Upper quadword copied in order.
03804     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
03805       return false;
03806 
03807     // Lower quadword shuffled.
03808     for (unsigned i = 8; i != 12; ++i)
03809       if (!isUndefOrInRange(Mask[i], 8, 12))
03810         return false;
03811   }
03812 
03813   return true;
03814 }
03815 
03816 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
03817 /// is suitable for input to PALIGNR.
03818 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
03819                           const X86Subtarget *Subtarget) {
03820   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
03821       (VT.is256BitVector() && !Subtarget->hasInt256()))
03822     return false;
03823 
03824   unsigned NumElts = VT.getVectorNumElements();
03825   unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
03826   unsigned NumLaneElts = NumElts/NumLanes;
03827 
03828   // Do not handle 64-bit element shuffles with palignr.
03829   if (NumLaneElts == 2)
03830     return false;
03831 
03832   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
03833     unsigned i;
03834     for (i = 0; i != NumLaneElts; ++i) {
03835       if (Mask[i+l] >= 0)
03836         break;
03837     }
03838 
03839     // Lane is all undef, go to next lane
03840     if (i == NumLaneElts)
03841       continue;
03842 
03843     int Start = Mask[i+l];
03844 
03845     // Make sure its in this lane in one of the sources
03846     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
03847         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
03848       return false;
03849 
03850     // If not lane 0, then we must match lane 0
03851     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
03852       return false;
03853 
03854     // Correct second source to be contiguous with first source
03855     if (Start >= (int)NumElts)
03856       Start -= NumElts - NumLaneElts;
03857 
03858     // Make sure we're shifting in the right direction.
03859     if (Start <= (int)(i+l))
03860       return false;
03861 
03862     Start -= i;
03863 
03864     // Check the rest of the elements to see if they are consecutive.
03865     for (++i; i != NumLaneElts; ++i) {
03866       int Idx = Mask[i+l];
03867 
03868       // Make sure its in this lane
03869       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
03870           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
03871         return false;
03872 
03873       // If not lane 0, then we must match lane 0
03874       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
03875         return false;
03876 
03877       if (Idx >= (int)NumElts)
03878         Idx -= NumElts - NumLaneElts;
03879 
03880       if (!isUndefOrEqual(Idx, Start+i))
03881         return false;
03882 
03883     }
03884   }
03885 
03886   return true;
03887 }
03888 
03889 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
03890 /// the two vector operands have swapped position.
03891 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
03892                                      unsigned NumElems) {
03893   for (unsigned i = 0; i != NumElems; ++i) {
03894     int idx = Mask[i];
03895     if (idx < 0)
03896       continue;
03897     else if (idx < (int)NumElems)
03898       Mask[i] = idx + NumElems;
03899     else
03900       Mask[i] = idx - NumElems;
03901   }
03902 }
03903 
03904 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
03905 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
03906 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
03907 /// reverse of what x86 shuffles want.
03908 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
03909 
03910   unsigned NumElems = VT.getVectorNumElements();
03911   unsigned NumLanes = VT.getSizeInBits()/128;
03912   unsigned NumLaneElems = NumElems/NumLanes;
03913 
03914   if (NumLaneElems != 2 && NumLaneElems != 4)
03915     return false;
03916 
03917   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
03918   bool symetricMaskRequired =
03919     (VT.getSizeInBits() >= 256) && (EltSize == 32);
03920 
03921   // VSHUFPSY divides the resulting vector into 4 chunks.
03922   // The sources are also splitted into 4 chunks, and each destination
03923   // chunk must come from a different source chunk.
03924   //
03925   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
03926   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
03927   //
03928   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
03929   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
03930   //
03931   // VSHUFPDY divides the resulting vector into 4 chunks.
03932   // The sources are also splitted into 4 chunks, and each destination
03933   // chunk must come from a different source chunk.
03934   //
03935   //  SRC1 =>      X3       X2       X1       X0
03936   //  SRC2 =>      Y3       Y2       Y1       Y0
03937   //
03938   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
03939   //
03940   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
03941   unsigned HalfLaneElems = NumLaneElems/2;
03942   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
03943     for (unsigned i = 0; i != NumLaneElems; ++i) {
03944       int Idx = Mask[i+l];
03945       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
03946       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
03947         return false;
03948       // For VSHUFPSY, the mask of the second half must be the same as the
03949       // first but with the appropriate offsets. This works in the same way as
03950       // VPERMILPS works with masks.
03951       if (!symetricMaskRequired || Idx < 0)
03952         continue;
03953       if (MaskVal[i] < 0) {
03954         MaskVal[i] = Idx - l;
03955         continue;
03956       }
03957       if ((signed)(Idx - l) != MaskVal[i])
03958         return false;
03959     }
03960   }
03961 
03962   return true;
03963 }
03964 
03965 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
03966 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
03967 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
03968   if (!VT.is128BitVector())
03969     return false;
03970 
03971   unsigned NumElems = VT.getVectorNumElements();
03972 
03973   if (NumElems != 4)
03974     return false;
03975 
03976   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
03977   return isUndefOrEqual(Mask[0], 6) &&
03978          isUndefOrEqual(Mask[1], 7) &&
03979          isUndefOrEqual(Mask[2], 2) &&
03980          isUndefOrEqual(Mask[3], 3);
03981 }
03982 
03983 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
03984 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
03985 /// <2, 3, 2, 3>
03986 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
03987   if (!VT.is128BitVector())
03988     return false;
03989 
03990   unsigned NumElems = VT.getVectorNumElements();
03991 
03992   if (NumElems != 4)
03993     return false;
03994 
03995   return isUndefOrEqual(Mask[0], 2) &&
03996          isUndefOrEqual(Mask[1], 3) &&
03997          isUndefOrEqual(Mask[2], 2) &&
03998          isUndefOrEqual(Mask[3], 3);
03999 }
04000 
04001 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
04002 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
04003 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
04004   if (!VT.is128BitVector())
04005     return false;
04006 
04007   unsigned NumElems = VT.getVectorNumElements();
04008 
04009   if (NumElems != 2 && NumElems != 4)
04010     return false;
04011 
04012   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04013     if (!isUndefOrEqual(Mask[i], i + NumElems))
04014       return false;
04015 
04016   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
04017     if (!isUndefOrEqual(Mask[i], i))
04018       return false;
04019 
04020   return true;
04021 }
04022 
04023 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
04024 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
04025 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
04026   if (!VT.is128BitVector())
04027     return false;
04028 
04029   unsigned NumElems = VT.getVectorNumElements();
04030 
04031   if (NumElems != 2 && NumElems != 4)
04032     return false;
04033 
04034   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04035     if (!isUndefOrEqual(Mask[i], i))
04036       return false;
04037 
04038   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04039     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
04040       return false;
04041 
04042   return true;
04043 }
04044 
04045 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
04046 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
04047 /// i. e: If all but one element come from the same vector.
04048 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
04049   // TODO: Deal with AVX's VINSERTPS
04050   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
04051     return false;
04052 
04053   unsigned CorrectPosV1 = 0;
04054   unsigned CorrectPosV2 = 0;
04055   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
04056     if (Mask[i] == -1) {
04057       ++CorrectPosV1;
04058       ++CorrectPosV2;
04059       continue;
04060     }
04061 
04062     if (Mask[i] == i)
04063       ++CorrectPosV1;
04064     else if (Mask[i] == i + 4)
04065       ++CorrectPosV2;
04066   }
04067 
04068   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
04069     // We have 3 elements (undefs count as elements from any vector) from one
04070     // vector, and one from another.
04071     return true;
04072 
04073   return false;
04074 }
04075 
04076 //
04077 // Some special combinations that can be optimized.
04078 //
04079 static
04080 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
04081                                SelectionDAG &DAG) {
04082   MVT VT = SVOp->getSimpleValueType(0);
04083   SDLoc dl(SVOp);
04084 
04085   if (VT != MVT::v8i32 && VT != MVT::v8f32)
04086     return SDValue();
04087 
04088   ArrayRef<int> Mask = SVOp->getMask();
04089 
04090   // These are the special masks that may be optimized.
04091   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
04092   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
04093   bool MatchEvenMask = true;
04094   bool MatchOddMask  = true;
04095   for (int i=0; i<8; ++i) {
04096     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
04097       MatchEvenMask = false;
04098     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
04099       MatchOddMask = false;
04100   }
04101 
04102   if (!MatchEvenMask && !MatchOddMask)
04103     return SDValue();
04104 
04105   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
04106 
04107   SDValue Op0 = SVOp->getOperand(0);
04108   SDValue Op1 = SVOp->getOperand(1);
04109 
04110   if (MatchEvenMask) {
04111     // Shift the second operand right to 32 bits.
04112     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
04113     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
04114   } else {
04115     // Shift the first operand left to 32 bits.
04116     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
04117     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
04118   }
04119   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
04120   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
04121 }
04122 
04123 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
04124 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
04125 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
04126                          bool HasInt256, bool V2IsSplat = false) {
04127 
04128   assert(VT.getSizeInBits() >= 128 &&
04129          "Unsupported vector type for unpckl");
04130 
04131   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04132   unsigned NumLanes;
04133   unsigned NumOf256BitLanes;
04134   unsigned NumElts = VT.getVectorNumElements();
04135   if (VT.is256BitVector()) {
04136     if (NumElts != 4 && NumElts != 8 &&
04137         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04138     return false;
04139     NumLanes = 2;
04140     NumOf256BitLanes = 1;
04141   } else if (VT.is512BitVector()) {
04142     assert(VT.getScalarType().getSizeInBits() >= 32 &&
04143            "Unsupported vector type for unpckh");
04144     NumLanes = 2;
04145     NumOf256BitLanes = 2;
04146   } else {
04147     NumLanes = 1;
04148     NumOf256BitLanes = 1;
04149   }
04150 
04151   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
04152   unsigned NumLaneElts = NumEltsInStride/NumLanes;
04153 
04154   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
04155     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
04156       for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04157         int BitI  = Mask[l256*NumEltsInStride+l+i];
04158         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
04159         if (!isUndefOrEqual(BitI, j+l256*NumElts))
04160           return false;
04161         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
04162           return false;
04163         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
04164           return false;
04165       }
04166     }
04167   }
04168   return true;
04169 }
04170 
04171 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
04172 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
04173 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
04174                          bool HasInt256, bool V2IsSplat = false) {
04175   assert(VT.getSizeInBits() >= 128 &&
04176          "Unsupported vector type for unpckh");
04177 
04178   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04179   unsigned NumLanes;
04180   unsigned NumOf256BitLanes;
04181   unsigned NumElts = VT.getVectorNumElements();
04182   if (VT.is256BitVector()) {
04183     if (NumElts != 4 && NumElts != 8 &&
04184         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04185     return false;
04186     NumLanes = 2;
04187     NumOf256BitLanes = 1;
04188   } else if (VT.is512BitVector()) {
04189     assert(VT.getScalarType().getSizeInBits() >= 32 &&
04190            "Unsupported vector type for unpckh");
04191     NumLanes = 2;
04192     NumOf256BitLanes = 2;
04193   } else {
04194     NumLanes = 1;
04195     NumOf256BitLanes = 1;
04196   }
04197 
04198   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
04199   unsigned NumLaneElts = NumEltsInStride/NumLanes;
04200 
04201   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
04202     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
04203       for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04204         int BitI  = Mask[l256*NumEltsInStride+l+i];
04205         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
04206         if (!isUndefOrEqual(BitI, j+l256*NumElts))
04207           return false;
04208         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
04209           return false;
04210         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
04211           return false;
04212       }
04213     }
04214   }
04215   return true;
04216 }
04217 
04218 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
04219 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
04220 /// <0, 0, 1, 1>
04221 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04222   unsigned NumElts = VT.getVectorNumElements();
04223   bool Is256BitVec = VT.is256BitVector();
04224 
04225   if (VT.is512BitVector())
04226     return false;
04227   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04228          "Unsupported vector type for unpckh");
04229 
04230   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
04231       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04232     return false;
04233 
04234   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
04235   // FIXME: Need a better way to get rid of this, there's no latency difference
04236   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
04237   // the former later. We should also remove the "_undef" special mask.
04238   if (NumElts == 4 && Is256BitVec)
04239     return false;
04240 
04241   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04242   // independently on 128-bit lanes.
04243   unsigned NumLanes = VT.getSizeInBits()/128;
04244   unsigned NumLaneElts = NumElts/NumLanes;
04245 
04246   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04247     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04248       int BitI  = Mask[l+i];
04249       int BitI1 = Mask[l+i+1];
04250 
04251       if (!isUndefOrEqual(BitI, j))
04252         return false;
04253       if (!isUndefOrEqual(BitI1, j))
04254         return false;
04255     }
04256   }
04257 
04258   return true;
04259 }
04260 
04261 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
04262 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
04263 /// <2, 2, 3, 3>
04264 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04265   unsigned NumElts = VT.getVectorNumElements();
04266 
04267   if (VT.is512BitVector())
04268     return false;
04269 
04270   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04271          "Unsupported vector type for unpckh");
04272 
04273   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04274       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04275     return false;
04276 
04277   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04278   // independently on 128-bit lanes.
04279   unsigned NumLanes = VT.getSizeInBits()/128;
04280   unsigned NumLaneElts = NumElts/NumLanes;
04281 
04282   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04283     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04284       int BitI  = Mask[l+i];
04285       int BitI1 = Mask[l+i+1];
04286       if (!isUndefOrEqual(BitI, j))
04287         return false;
04288       if (!isUndefOrEqual(BitI1, j))
04289         return false;
04290     }
04291   }
04292   return true;
04293 }
04294 
04295 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
04296 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors
04297 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
04298   if (!VT.is512BitVector())
04299     return false;
04300 
04301   unsigned NumElts = VT.getVectorNumElements();
04302   unsigned HalfSize = NumElts/2;
04303   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
04304     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
04305       *Imm = 1;
04306       return true;
04307     }
04308   }
04309   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
04310     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
04311       *Imm = 0;
04312       return true;
04313     }
04314   }
04315   return false;
04316 }
04317 
04318 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
04319 /// specifies a shuffle of elements that is suitable for input to MOVSS,
04320 /// MOVSD, and MOVD, i.e. setting the lowest element.
04321 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
04322   if (VT.getVectorElementType().getSizeInBits() < 32)
04323     return false;
04324   if (!VT.is128BitVector())
04325     return false;
04326 
04327   unsigned NumElts = VT.getVectorNumElements();
04328 
04329   if (!isUndefOrEqual(Mask[0], NumElts))
04330     return false;
04331 
04332   for (unsigned i = 1; i != NumElts; ++i)
04333     if (!isUndefOrEqual(Mask[i], i))
04334       return false;
04335 
04336   return true;
04337 }
04338 
04339 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
04340 /// as permutations between 128-bit chunks or halves. As an example: this
04341 /// shuffle bellow:
04342 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
04343 /// The first half comes from the second half of V1 and the second half from the
04344 /// the second half of V2.
04345 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04346   if (!HasFp256 || !VT.is256BitVector())
04347     return false;
04348 
04349   // The shuffle result is divided into half A and half B. In total the two
04350   // sources have 4 halves, namely: C, D, E, F. The final values of A and
04351   // B must come from C, D, E or F.
04352   unsigned HalfSize = VT.getVectorNumElements()/2;
04353   bool MatchA = false, MatchB = false;
04354 
04355   // Check if A comes from one of C, D, E, F.
04356   for (unsigned Half = 0; Half != 4; ++Half) {
04357     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
04358       MatchA = true;
04359       break;
04360     }
04361   }
04362 
04363   // Check if B comes from one of C, D, E, F.
04364   for (unsigned Half = 0; Half != 4; ++Half) {
04365     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
04366       MatchB = true;
04367       break;
04368     }
04369   }
04370 
04371   return MatchA && MatchB;
04372 }
04373 
04374 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
04375 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
04376 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
04377   MVT VT = SVOp->getSimpleValueType(0);
04378 
04379   unsigned HalfSize = VT.getVectorNumElements()/2;
04380 
04381   unsigned FstHalf = 0, SndHalf = 0;
04382   for (unsigned i = 0; i < HalfSize; ++i) {
04383     if (SVOp->getMaskElt(i) > 0) {
04384       FstHalf = SVOp->getMaskElt(i)/HalfSize;
04385       break;
04386     }
04387   }
04388   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
04389     if (SVOp->getMaskElt(i) > 0) {
04390       SndHalf = SVOp->getMaskElt(i)/HalfSize;
04391       break;
04392     }
04393   }
04394 
04395   return (FstHalf | (SndHalf << 4));
04396 }
04397 
04398 // Symetric in-lane mask. Each lane has 4 elements (for imm8)
04399 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
04400   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04401   if (EltSize < 32)
04402     return false;
04403 
04404   unsigned NumElts = VT.getVectorNumElements();
04405   Imm8 = 0;
04406   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
04407     for (unsigned i = 0; i != NumElts; ++i) {
04408       if (Mask[i] < 0)
04409         continue;
04410       Imm8 |= Mask[i] << (i*2);
04411     }
04412     return true;
04413   }
04414 
04415   unsigned LaneSize = 4;
04416   SmallVector<int, 4> MaskVal(LaneSize, -1);
04417 
04418   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04419     for (unsigned i = 0; i != LaneSize; ++i) {
04420       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04421         return false;
04422       if (Mask[i+l] < 0)
04423         continue;
04424       if (MaskVal[i] < 0) {
04425         MaskVal[i] = Mask[i+l] - l;
04426         Imm8 |= MaskVal[i] << (i*2);
04427         continue;
04428       }
04429       if (Mask[i+l] != (signed)(MaskVal[i]+l))
04430         return false;
04431     }
04432   }
04433   return true;
04434 }
04435 
04436 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
04437 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
04438 /// Note that VPERMIL mask matching is different depending whether theunderlying
04439 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
04440 /// to the same elements of the low, but to the higher half of the source.
04441 /// In VPERMILPD the two lanes could be shuffled independently of each other
04442 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
04443 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
04444   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04445   if (VT.getSizeInBits() < 256 || EltSize < 32)
04446     return false;
04447   bool symetricMaskRequired = (EltSize == 32);
04448   unsigned NumElts = VT.getVectorNumElements();
04449 
04450   unsigned NumLanes = VT.getSizeInBits()/128;
04451   unsigned LaneSize = NumElts/NumLanes;
04452   // 2 or 4 elements in one lane
04453 
04454   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
04455   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04456     for (unsigned i = 0; i != LaneSize; ++i) {
04457       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04458         return false;
04459       if (symetricMaskRequired) {
04460         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
04461           ExpectedMaskVal[i] = Mask[i+l] - l;
04462           continue;
04463         }
04464         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
04465           return false;
04466       }
04467     }
04468   }
04469   return true;
04470 }
04471 
04472 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
04473 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
04474 /// element of vector 2 and the other elements to come from vector 1 in order.
04475 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
04476                                bool V2IsSplat = false, bool V2IsUndef = false) {
04477   if (!VT.is128BitVector())
04478     return false;
04479 
04480   unsigned NumOps = VT.getVectorNumElements();
04481   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
04482     return false;
04483 
04484   if (!isUndefOrEqual(Mask[0], 0))
04485     return false;
04486 
04487   for (unsigned i = 1; i != NumOps; ++i)
04488     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
04489           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
04490           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
04491       return false;
04492 
04493   return true;
04494 }
04495 
04496 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04497 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
04498 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
04499 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
04500                            const X86Subtarget *Subtarget) {
04501   if (!Subtarget->hasSSE3())
04502     return false;
04503 
04504   unsigned NumElems = VT.getVectorNumElements();
04505 
04506   if ((VT.is128BitVector() && NumElems != 4) ||
04507       (VT.is256BitVector() && NumElems != 8) ||
04508       (VT.is512BitVector() && NumElems != 16))
04509     return false;
04510 
04511   // "i+1" is the value the indexed mask element must have
04512   for (unsigned i = 0; i != NumElems; i += 2)
04513     if (!isUndefOrEqual(Mask[i], i+1) ||
04514         !isUndefOrEqual(Mask[i+1], i+1))
04515       return false;
04516 
04517   return true;
04518 }
04519 
04520 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04521 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
04522 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
04523 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
04524                            const X86Subtarget *Subtarget) {
04525   if (!Subtarget->hasSSE3())
04526     return false;
04527 
04528   unsigned NumElems = VT.getVectorNumElements();
04529 
04530   if ((VT.is128BitVector() && NumElems != 4) ||
04531       (VT.is256BitVector() && NumElems != 8) ||
04532       (VT.is512BitVector() && NumElems != 16))
04533     return false;
04534 
04535   // "i" is the value the indexed mask element must have
04536   for (unsigned i = 0; i != NumElems; i += 2)
04537     if (!isUndefOrEqual(Mask[i], i) ||
04538         !isUndefOrEqual(Mask[i+1], i))
04539       return false;
04540 
04541   return true;
04542 }
04543 
04544 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
04545 /// specifies a shuffle of elements that is suitable for input to 256-bit
04546 /// version of MOVDDUP.
04547 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04548   if (!HasFp256 || !VT.is256BitVector())
04549     return false;
04550 
04551   unsigned NumElts = VT.getVectorNumElements();
04552   if (NumElts != 4)
04553     return false;
04554 
04555   for (unsigned i = 0; i != NumElts/2; ++i)
04556     if (!isUndefOrEqual(Mask[i], 0))
04557       return false;
04558   for (unsigned i = NumElts/2; i != NumElts; ++i)
04559     if (!isUndefOrEqual(Mask[i], NumElts/2))
04560       return false;
04561   return true;
04562 }
04563 
04564 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04565 /// specifies a shuffle of elements that is suitable for input to 128-bit
04566 /// version of MOVDDUP.
04567 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
04568   if (!VT.is128BitVector())
04569     return false;
04570 
04571   unsigned e = VT.getVectorNumElements() / 2;
04572   for (unsigned i = 0; i != e; ++i)
04573     if (!isUndefOrEqual(Mask[i], i))
04574       return false;
04575   for (unsigned i = 0; i != e; ++i)
04576     if (!isUndefOrEqual(Mask[e+i], i))
04577       return false;
04578   return true;
04579 }
04580 
04581 /// isVEXTRACTIndex - Return true if the specified
04582 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
04583 /// suitable for instruction that extract 128 or 256 bit vectors
04584 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
04585   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04586   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04587     return false;
04588 
04589   // The index should be aligned on a vecWidth-bit boundary.
04590   uint64_t Index =
04591     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04592 
04593   MVT VT = N->getSimpleValueType(0);
04594   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04595   bool Result = (Index * ElSize) % vecWidth == 0;
04596 
04597   return Result;
04598 }
04599 
04600 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
04601 /// operand specifies a subvector insert that is suitable for input to
04602 /// insertion of 128 or 256-bit subvectors
04603 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
04604   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04605   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04606     return false;
04607   // The index should be aligned on a vecWidth-bit boundary.
04608   uint64_t Index =
04609     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04610 
04611   MVT VT = N->getSimpleValueType(0);
04612   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04613   bool Result = (Index * ElSize) % vecWidth == 0;
04614 
04615   return Result;
04616 }
04617 
04618 bool X86::isVINSERT128Index(SDNode *N) {
04619   return isVINSERTIndex(N, 128);
04620 }
04621 
04622 bool X86::isVINSERT256Index(SDNode *N) {
04623   return isVINSERTIndex(N, 256);
04624 }
04625 
04626 bool X86::isVEXTRACT128Index(SDNode *N) {
04627   return isVEXTRACTIndex(N, 128);
04628 }
04629 
04630 bool X86::isVEXTRACT256Index(SDNode *N) {
04631   return isVEXTRACTIndex(N, 256);
04632 }
04633 
04634 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
04635 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
04636 /// Handles 128-bit and 256-bit.
04637 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
04638   MVT VT = N->getSimpleValueType(0);
04639 
04640   assert((VT.getSizeInBits() >= 128) &&
04641          "Unsupported vector type for PSHUF/SHUFP");
04642 
04643   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
04644   // independently on 128-bit lanes.
04645   unsigned NumElts = VT.getVectorNumElements();
04646   unsigned NumLanes = VT.getSizeInBits()/128;
04647   unsigned NumLaneElts = NumElts/NumLanes;
04648 
04649   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
04650          "Only supports 2, 4 or 8 elements per lane");
04651 
04652   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
04653   unsigned Mask = 0;
04654   for (unsigned i = 0; i != NumElts; ++i) {
04655     int Elt = N->getMaskElt(i);
04656     if (Elt < 0) continue;
04657     Elt &= NumLaneElts - 1;
04658     unsigned ShAmt = (i << Shift) % 8;
04659     Mask |= Elt << ShAmt;
04660   }
04661 
04662   return Mask;
04663 }
04664 
04665 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
04666 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
04667 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
04668   MVT VT = N->getSimpleValueType(0);
04669 
04670   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04671          "Unsupported vector type for PSHUFHW");
04672 
04673   unsigned NumElts = VT.getVectorNumElements();
04674 
04675   unsigned Mask = 0;
04676   for (unsigned l = 0; l != NumElts; l += 8) {
04677     // 8 nodes per lane, but we only care about the last 4.
04678     for (unsigned i = 0; i < 4; ++i) {
04679       int Elt = N->getMaskElt(l+i+4);
04680       if (Elt < 0) continue;
04681       Elt &= 0x3; // only 2-bits.
04682       Mask |= Elt << (i * 2);
04683     }
04684   }
04685 
04686   return Mask;
04687 }
04688 
04689 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
04690 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
04691 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
04692   MVT VT = N->getSimpleValueType(0);
04693 
04694   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04695          "Unsupported vector type for PSHUFHW");
04696 
04697   unsigned NumElts = VT.getVectorNumElements();
04698 
04699   unsigned Mask = 0;
04700   for (unsigned l = 0; l != NumElts; l += 8) {
04701     // 8 nodes per lane, but we only care about the first 4.
04702     for (unsigned i = 0; i < 4; ++i) {
04703       int Elt = N->getMaskElt(l+i);
04704       if (Elt < 0) continue;
04705       Elt &= 0x3; // only 2-bits
04706       Mask |= Elt << (i * 2);
04707     }
04708   }
04709 
04710   return Mask;
04711 }
04712 
04713 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
04714 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
04715 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
04716   MVT VT = SVOp->getSimpleValueType(0);
04717   unsigned EltSize = VT.is512BitVector() ? 1 :
04718     VT.getVectorElementType().getSizeInBits() >> 3;
04719 
04720   unsigned NumElts = VT.getVectorNumElements();
04721   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
04722   unsigned NumLaneElts = NumElts/NumLanes;
04723 
04724   int Val = 0;
04725   unsigned i;
04726   for (i = 0; i != NumElts; ++i) {
04727     Val = SVOp->getMaskElt(i);
04728     if (Val >= 0)
04729       break;
04730   }
04731   if (Val >= (int)NumElts)
04732     Val -= NumElts - NumLaneElts;
04733 
04734   assert(Val - i > 0 && "PALIGNR imm should be positive");
04735   return (Val - i) * EltSize;
04736 }
04737 
04738 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
04739   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04740   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04741     llvm_unreachable("Illegal extract subvector for VEXTRACT");
04742 
04743   uint64_t Index =
04744     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04745 
04746   MVT VecVT = N->getOperand(0).getSimpleValueType();
04747   MVT ElVT = VecVT.getVectorElementType();
04748 
04749   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04750   return Index / NumElemsPerChunk;
04751 }
04752 
04753 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
04754   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04755   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04756     llvm_unreachable("Illegal insert subvector for VINSERT");
04757 
04758   uint64_t Index =
04759     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04760 
04761   MVT VecVT = N->getSimpleValueType(0);
04762   MVT ElVT = VecVT.getVectorElementType();
04763 
04764   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04765   return Index / NumElemsPerChunk;
04766 }
04767 
04768 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
04769 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
04770 /// and VINSERTI128 instructions.
04771 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
04772   return getExtractVEXTRACTImmediate(N, 128);
04773 }
04774 
04775 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
04776 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
04777 /// and VINSERTI64x4 instructions.
04778 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
04779   return getExtractVEXTRACTImmediate(N, 256);
04780 }
04781 
04782 /// getInsertVINSERT128Immediate - Return the appropriate immediate
04783 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
04784 /// and VINSERTI128 instructions.
04785 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
04786   return getInsertVINSERTImmediate(N, 128);
04787 }
04788 
04789 /// getInsertVINSERT256Immediate - Return the appropriate immediate
04790 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
04791 /// and VINSERTI64x4 instructions.
04792 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
04793   return getInsertVINSERTImmediate(N, 256);
04794 }
04795 
04796 /// isZero - Returns true if Elt is a constant integer zero
04797 static bool isZero(SDValue V) {
04798   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
04799   return C && C->isNullValue();
04800 }
04801 
04802 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
04803 /// constant +0.0.
04804 bool X86::isZeroNode(SDValue Elt) {
04805   if (isZero(Elt))
04806     return true;
04807   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
04808     return CFP->getValueAPF().isPosZero();
04809   return false;
04810 }
04811 
04812 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
04813 /// match movhlps. The lower half elements should come from upper half of
04814 /// V1 (and in order), and the upper half elements should come from the upper
04815 /// half of V2 (and in order).
04816 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
04817   if (!VT.is128BitVector())
04818     return false;
04819   if (VT.getVectorNumElements() != 4)
04820     return false;
04821   for (unsigned i = 0, e = 2; i != e; ++i)
04822     if (!isUndefOrEqual(Mask[i], i+2))
04823       return false;
04824   for (unsigned i = 2; i != 4; ++i)
04825     if (!isUndefOrEqual(Mask[i], i+4))
04826       return false;
04827   return true;
04828 }
04829 
04830 /// isScalarLoadToVector - Returns true if the node is a scalar load that
04831 /// is promoted to a vector. It also returns the LoadSDNode by reference if
04832 /// required.
04833 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
04834   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
04835     return false;
04836   N = N->getOperand(0).getNode();
04837   if (!ISD::isNON_EXTLoad(N))
04838     return false;
04839   if (LD)
04840     *LD = cast<LoadSDNode>(N);
04841   return true;
04842 }
04843 
04844 // Test whether the given value is a vector value which will be legalized
04845 // into a load.
04846 static bool WillBeConstantPoolLoad(SDNode *N) {
04847   if (N->getOpcode() != ISD::BUILD_VECTOR)
04848     return false;
04849 
04850   // Check for any non-constant elements.
04851   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
04852     switch (N->getOperand(i).getNode()->getOpcode()) {
04853     case ISD::UNDEF:
04854     case ISD::ConstantFP:
04855     case ISD::Constant:
04856       break;
04857     default:
04858       return false;
04859     }
04860 
04861   // Vectors of all-zeros and all-ones are materialized with special
04862   // instructions rather than being loaded.
04863   return !ISD::isBuildVectorAllZeros(N) &&
04864          !ISD::isBuildVectorAllOnes(N);
04865 }
04866 
04867 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
04868 /// match movlp{s|d}. The lower half elements should come from lower half of
04869 /// V1 (and in order), and the upper half elements should come from the upper
04870 /// half of V2 (and in order). And since V1 will become the source of the
04871 /// MOVLP, it must be either a vector load or a scalar load to vector.
04872 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
04873                                ArrayRef<int> Mask, MVT VT) {
04874   if (!VT.is128BitVector())
04875     return false;
04876 
04877   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
04878     return false;
04879   // Is V2 is a vector load, don't do this transformation. We will try to use
04880   // load folding shufps op.
04881   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
04882     return false;
04883 
04884   unsigned NumElems = VT.getVectorNumElements();
04885 
04886   if (NumElems != 2 && NumElems != 4)
04887     return false;
04888   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04889     if (!isUndefOrEqual(Mask[i], i))
04890       return false;
04891   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
04892     if (!isUndefOrEqual(Mask[i], i+NumElems))
04893       return false;
04894   return true;
04895 }
04896 
04897 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
04898 /// to an zero vector.
04899 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
04900 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
04901   SDValue V1 = N->getOperand(0);
04902   SDValue V2 = N->getOperand(1);
04903   unsigned NumElems = N->getValueType(0).getVectorNumElements();
04904   for (unsigned i = 0; i != NumElems; ++i) {
04905     int Idx = N->getMaskElt(i);
04906     if (Idx >= (int)NumElems) {
04907       unsigned Opc = V2.getOpcode();
04908       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
04909         continue;
04910       if (Opc != ISD::BUILD_VECTOR ||
04911           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
04912         return false;
04913     } else if (Idx >= 0) {
04914       unsigned Opc = V1.getOpcode();
04915       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
04916         continue;
04917       if (Opc != ISD::BUILD_VECTOR ||
04918           !X86::isZeroNode(V1.getOperand(Idx)))
04919         return false;
04920     }
04921   }
04922   return true;
04923 }
04924 
04925 /// getZeroVector - Returns a vector of specified type with all zero elements.
04926 ///
04927 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
04928                              SelectionDAG &DAG, SDLoc dl) {
04929   assert(VT.isVector() && "Expected a vector type");
04930 
04931   // Always build SSE zero vectors as <4 x i32> bitcasted
04932   // to their dest type. This ensures they get CSE'd.
04933   SDValue Vec;
04934   if (VT.is128BitVector()) {  // SSE
04935     if (Subtarget->hasSSE2()) {  // SSE2
04936       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04937       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04938     } else { // SSE1
04939       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
04940       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
04941     }
04942   } else if (VT.is256BitVector()) { // AVX
04943     if (Subtarget->hasInt256()) { // AVX2
04944       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04945       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04946       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
04947     } else {
04948       // 256-bit logic and arithmetic instructions in AVX are all
04949       // floating-point, no support for integer ops. Emit fp zeroed vectors.
04950       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
04951       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04952       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
04953     }
04954   } else if (VT.is512BitVector()) { // AVX-512
04955       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04956       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
04957                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04958       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
04959   } else if (VT.getScalarType() == MVT::i1) {
04960     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
04961     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
04962     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
04963     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
04964   } else
04965     llvm_unreachable("Unexpected vector type");
04966 
04967   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04968 }
04969 
04970 /// getOnesVector - Returns a vector of specified type with all bits set.
04971 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
04972 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
04973 /// Then bitcast to their original type, ensuring they get CSE'd.
04974 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
04975                              SDLoc dl) {
04976   assert(VT.isVector() && "Expected a vector type");
04977 
04978   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
04979   SDValue Vec;
04980   if (VT.is256BitVector()) {
04981     if (HasInt256) { // AVX2
04982       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04983       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
04984     } else { // AVX
04985       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04986       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
04987     }
04988   } else if (VT.is128BitVector()) {
04989     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04990   } else
04991     llvm_unreachable("Unexpected vector type");
04992 
04993   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04994 }
04995 
04996 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
04997 /// that point to V2 points to its first element.
04998 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
04999   for (unsigned i = 0; i != NumElems; ++i) {
05000     if (Mask[i] > (int)NumElems) {
05001       Mask[i] = NumElems;
05002     }
05003   }
05004 }
05005 
05006 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
05007 /// operation of specified width.
05008 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
05009                        SDValue V2) {
05010   unsigned NumElems = VT.getVectorNumElements();
05011   SmallVector<int, 8> Mask;
05012   Mask.push_back(NumElems);
05013   for (unsigned i = 1; i != NumElems; ++i)
05014     Mask.push_back(i);
05015   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05016 }
05017 
05018 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
05019 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05020                           SDValue V2) {
05021   unsigned NumElems = VT.getVectorNumElements();
05022   SmallVector<int, 8> Mask;
05023   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
05024     Mask.push_back(i);
05025     Mask.push_back(i + NumElems);
05026   }
05027   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05028 }
05029 
05030 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
05031 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05032                           SDValue V2) {
05033   unsigned NumElems = VT.getVectorNumElements();
05034   SmallVector<int, 8> Mask;
05035   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
05036     Mask.push_back(i + Half);
05037     Mask.push_back(i + NumElems + Half);
05038   }
05039   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05040 }
05041 
05042 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
05043 // a generic shuffle instruction because the target has no such instructions.
05044 // Generate shuffles which repeat i16 and i8 several times until they can be
05045 // represented by v4f32 and then be manipulated by target suported shuffles.
05046 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
05047   MVT VT = V.getSimpleValueType();
05048   int NumElems = VT.getVectorNumElements();
05049   SDLoc dl(V);
05050 
05051   while (NumElems > 4) {
05052     if (EltNo < NumElems/2) {
05053       V = getUnpackl(DAG, dl, VT, V, V);
05054     } else {
05055       V = getUnpackh(DAG, dl, VT, V, V);
05056       EltNo -= NumElems/2;
05057     }
05058     NumElems >>= 1;
05059   }
05060   return V;
05061 }
05062 
05063 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
05064 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
05065   MVT VT = V.getSimpleValueType();
05066   SDLoc dl(V);
05067 
05068   if (VT.is128BitVector()) {
05069     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
05070     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
05071     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
05072                              &SplatMask[0]);
05073   } else if (VT.is256BitVector()) {
05074     // To use VPERMILPS to splat scalars, the second half of indicies must
05075     // refer to the higher part, which is a duplication of the lower one,
05076     // because VPERMILPS can only handle in-lane permutations.
05077     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
05078                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
05079 
05080     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
05081     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
05082                              &SplatMask[0]);
05083   } else
05084     llvm_unreachable("Vector size not supported");
05085 
05086   return DAG.getNode(ISD::BITCAST, dl, VT, V);
05087 }
05088 
05089 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
05090 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
05091   MVT SrcVT = SV->getSimpleValueType(0);
05092   SDValue V1 = SV->getOperand(0);
05093   SDLoc dl(SV);
05094 
05095   int EltNo = SV->getSplatIndex();
05096   int NumElems = SrcVT.getVectorNumElements();
05097   bool Is256BitVec = SrcVT.is256BitVector();
05098 
05099   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
05100          "Unknown how to promote splat for type");
05101 
05102   // Extract the 128-bit part containing the splat element and update
05103   // the splat element index when it refers to the higher register.
05104   if (Is256BitVec) {
05105     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
05106     if (EltNo >= NumElems/2)
05107       EltNo -= NumElems/2;
05108   }
05109 
05110   // All i16 and i8 vector types can't be used directly by a generic shuffle
05111   // instruction because the target has no such instruction. Generate shuffles
05112   // which repeat i16 and i8 several times until they fit in i32, and then can
05113   // be manipulated by target suported shuffles.
05114   MVT EltVT = SrcVT.getVectorElementType();
05115   if (EltVT == MVT::i8 || EltVT == MVT::i16)
05116     V1 = PromoteSplati8i16(V1, DAG, EltNo);
05117 
05118   // Recreate the 256-bit vector and place the same 128-bit vector
05119   // into the low and high part. This is necessary because we want
05120   // to use VPERM* to shuffle the vectors
05121   if (Is256BitVec) {
05122     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
05123   }
05124 
05125   return getLegalSplat(DAG, V1, EltNo);
05126 }
05127 
05128 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
05129 /// vector of zero or undef vector.  This produces a shuffle where the low
05130 /// element of V2 is swizzled into the zero/undef vector, landing at element
05131 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
05132 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
05133                                            bool IsZero,
05134                                            const X86Subtarget *Subtarget,
05135                                            SelectionDAG &DAG) {
05136   MVT VT = V2.getSimpleValueType();
05137   SDValue V1 = IsZero
05138     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
05139   unsigned NumElems = VT.getVectorNumElements();
05140   SmallVector<int, 16> MaskVec;
05141   for (unsigned i = 0; i != NumElems; ++i)
05142     // If this is the insertion idx, put the low elt of V2 here.
05143     MaskVec.push_back(i == Idx ? NumElems : i);
05144   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
05145 }
05146 
05147 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
05148 /// target specific opcode. Returns true if the Mask could be calculated.
05149 /// Sets IsUnary to true if only uses one source.
05150 static bool getTargetShuffleMask(SDNode *N, MVT VT,
05151                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
05152   unsigned NumElems = VT.getVectorNumElements();
05153   SDValue ImmN;
05154 
05155   IsUnary = false;
05156   switch(N->getOpcode()) {
05157   case X86ISD::SHUFP:
05158     ImmN = N->getOperand(N->getNumOperands()-1);
05159     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05160     break;
05161   case X86ISD::UNPCKH:
05162     DecodeUNPCKHMask(VT, Mask);
05163     break;
05164   case X86ISD::UNPCKL:
05165     DecodeUNPCKLMask(VT, Mask);
05166     break;
05167   case X86ISD::MOVHLPS:
05168     DecodeMOVHLPSMask(NumElems, Mask);
05169     break;
05170   case X86ISD::MOVLHPS:
05171     DecodeMOVLHPSMask(NumElems, Mask);
05172     break;
05173   case X86ISD::PALIGNR:
05174     ImmN = N->getOperand(N->getNumOperands()-1);
05175     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05176     break;
05177   case X86ISD::PSHUFD:
05178   case X86ISD::VPERMILP:
05179     ImmN = N->getOperand(N->getNumOperands()-1);
05180     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05181     IsUnary = true;
05182     break;
05183   case X86ISD::PSHUFHW:
05184     ImmN = N->getOperand(N->getNumOperands()-1);
05185     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05186     IsUnary = true;
05187     break;
05188   case X86ISD::PSHUFLW:
05189     ImmN = N->getOperand(N->getNumOperands()-1);
05190     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05191     IsUnary = true;
05192     break;
05193   case X86ISD::VPERMI:
05194     ImmN = N->getOperand(N->getNumOperands()-1);
05195     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05196     IsUnary = true;
05197     break;
05198   case X86ISD::MOVSS:
05199   case X86ISD::MOVSD: {
05200     // The index 0 always comes from the first element of the second source,
05201     // this is why MOVSS and MOVSD are used in the first place. The other
05202     // elements come from the other positions of the first source vector
05203     Mask.push_back(NumElems);
05204     for (unsigned i = 1; i != NumElems; ++i) {
05205       Mask.push_back(i);
05206     }
05207     break;
05208   }
05209   case X86ISD::VPERM2X128:
05210     ImmN = N->getOperand(N->getNumOperands()-1);
05211     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05212     if (Mask.empty()) return false;
05213     break;
05214   case X86ISD::MOVDDUP:
05215   case X86ISD::MOVLHPD:
05216   case X86ISD::MOVLPD:
05217   case X86ISD::MOVLPS:
05218   case X86ISD::MOVSHDUP:
05219   case X86ISD::MOVSLDUP:
05220     // Not yet implemented
05221     return false;
05222   default: llvm_unreachable("unknown target shuffle node");
05223   }
05224 
05225   return true;
05226 }
05227 
05228 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
05229 /// element of the result of the vector shuffle.
05230 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
05231                                    unsigned Depth) {
05232   if (Depth == 6)
05233     return SDValue();  // Limit search depth.
05234 
05235   SDValue V = SDValue(N, 0);
05236   EVT VT = V.getValueType();
05237   unsigned Opcode = V.getOpcode();
05238 
05239   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
05240   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
05241     int Elt = SV->getMaskElt(Index);
05242 
05243     if (Elt < 0)
05244       return DAG.getUNDEF(VT.getVectorElementType());
05245 
05246     unsigned NumElems = VT.getVectorNumElements();
05247     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
05248                                          : SV->getOperand(1);
05249     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
05250   }
05251 
05252   // Recurse into target specific vector shuffles to find scalars.
05253   if (isTargetShuffle(Opcode)) {
05254     MVT ShufVT = V.getSimpleValueType();
05255     unsigned NumElems = ShufVT.getVectorNumElements();
05256     SmallVector<int, 16> ShuffleMask;
05257     bool IsUnary;
05258 
05259     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
05260       return SDValue();
05261 
05262     int Elt = ShuffleMask[Index];
05263     if (Elt < 0)
05264       return DAG.getUNDEF(ShufVT.getVectorElementType());
05265 
05266     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
05267                                          : N->getOperand(1);
05268     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
05269                                Depth+1);
05270   }
05271 
05272   // Actual nodes that may contain scalar elements
05273   if (Opcode == ISD::BITCAST) {
05274     V = V.getOperand(0);
05275     EVT SrcVT = V.getValueType();
05276     unsigned NumElems = VT.getVectorNumElements();
05277 
05278     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
05279       return SDValue();
05280   }
05281 
05282   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
05283     return (Index == 0) ? V.getOperand(0)
05284                         : DAG.getUNDEF(VT.getVectorElementType());
05285 
05286   if (V.getOpcode() == ISD::BUILD_VECTOR)
05287     return V.getOperand(Index);
05288 
05289   return SDValue();
05290 }
05291 
05292 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
05293 /// shuffle operation which come from a consecutively from a zero. The
05294 /// search can start in two different directions, from left or right.
05295 /// We count undefs as zeros until PreferredNum is reached.
05296 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
05297                                          unsigned NumElems, bool ZerosFromLeft,
05298                                          SelectionDAG &DAG,
05299                                          unsigned PreferredNum = -1U) {
05300   unsigned NumZeros = 0;
05301   for (unsigned i = 0; i != NumElems; ++i) {
05302     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
05303     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
05304     if (!Elt.getNode())
05305       break;
05306 
05307     if (X86::isZeroNode(Elt))
05308       ++NumZeros;
05309     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
05310       NumZeros = std::min(NumZeros + 1, PreferredNum);
05311     else
05312       break;
05313   }
05314 
05315   return NumZeros;
05316 }
05317 
05318 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
05319 /// correspond consecutively to elements from one of the vector operands,
05320 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
05321 static
05322 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
05323                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
05324                               unsigned NumElems, unsigned &OpNum) {
05325   bool SeenV1 = false;
05326   bool SeenV2 = false;
05327 
05328   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
05329     int Idx = SVOp->getMaskElt(i);
05330     // Ignore undef indicies
05331     if (Idx < 0)
05332       continue;
05333 
05334     if (Idx < (int)NumElems)
05335       SeenV1 = true;
05336     else
05337       SeenV2 = true;
05338 
05339     // Only accept consecutive elements from the same vector
05340     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
05341       return false;
05342   }
05343 
05344   OpNum = SeenV1 ? 0 : 1;
05345   return true;
05346 }
05347 
05348 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
05349 /// logical left shift of a vector.
05350 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05351                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05352   unsigned NumElems =
05353     SVOp->getSimpleValueType(0).getVectorNumElements();
05354   unsigned NumZeros = getNumOfConsecutiveZeros(
05355       SVOp, NumElems, false /* check zeros from right */, DAG,
05356       SVOp->getMaskElt(0));
05357   unsigned OpSrc;
05358 
05359   if (!NumZeros)
05360     return false;
05361 
05362   // Considering the elements in the mask that are not consecutive zeros,
05363   // check if they consecutively come from only one of the source vectors.
05364   //
05365   //               V1 = {X, A, B, C}     0
05366   //                         \  \  \    /
05367   //   vector_shuffle V1, V2 <1, 2, 3, X>
05368   //
05369   if (!isShuffleMaskConsecutive(SVOp,
05370             0,                   // Mask Start Index
05371             NumElems-NumZeros,   // Mask End Index(exclusive)
05372             NumZeros,            // Where to start looking in the src vector
05373             NumElems,            // Number of elements in vector
05374             OpSrc))              // Which source operand ?
05375     return false;
05376 
05377   isLeft = false;
05378   ShAmt = NumZeros;
05379   ShVal = SVOp->getOperand(OpSrc);
05380   return true;
05381 }
05382 
05383 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
05384 /// logical left shift of a vector.
05385 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05386                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05387   unsigned NumElems =
05388     SVOp->getSimpleValueType(0).getVectorNumElements();
05389   unsigned NumZeros = getNumOfConsecutiveZeros(
05390       SVOp, NumElems, true /* check zeros from left */, DAG,
05391       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
05392   unsigned OpSrc;
05393 
05394   if (!NumZeros)
05395     return false;
05396 
05397   // Considering the elements in the mask that are not consecutive zeros,
05398   // check if they consecutively come from only one of the source vectors.
05399   //
05400   //                           0    { A, B, X, X } = V2
05401   //                          / \    /  /
05402   //   vector_shuffle V1, V2 <X, X, 4, 5>
05403   //
05404   if (!isShuffleMaskConsecutive(SVOp,
05405             NumZeros,     // Mask Start Index
05406             NumElems,     // Mask End Index(exclusive)
05407             0,            // Where to start looking in the src vector
05408             NumElems,     // Number of elements in vector
05409             OpSrc))       // Which source operand ?
05410     return false;
05411 
05412   isLeft = true;
05413   ShAmt = NumZeros;
05414   ShVal = SVOp->getOperand(OpSrc);
05415   return true;
05416 }
05417 
05418 /// isVectorShift - Returns true if the shuffle can be implemented as a
05419 /// logical left or right shift of a vector.
05420 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05421                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05422   // Although the logic below support any bitwidth size, there are no
05423   // shift instructions which handle more than 128-bit vectors.
05424   if (!SVOp->getSimpleValueType(0).is128BitVector())
05425     return false;
05426 
05427   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
05428       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
05429     return true;
05430 
05431   return false;
05432 }
05433 
05434 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
05435 ///
05436 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
05437                                        unsigned NumNonZero, unsigned NumZero,
05438                                        SelectionDAG &DAG,
05439                                        const X86Subtarget* Subtarget,
05440                                        const TargetLowering &TLI) {
05441   if (NumNonZero > 8)
05442     return SDValue();
05443 
05444   SDLoc dl(Op);
05445   SDValue V;
05446   bool First = true;
05447   for (unsigned i = 0; i < 16; ++i) {
05448     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
05449     if (ThisIsNonZero && First) {
05450       if (NumZero)
05451         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05452       else
05453         V = DAG.getUNDEF(MVT::v8i16);
05454       First = false;
05455     }
05456 
05457     if ((i & 1) != 0) {
05458       SDValue ThisElt, LastElt;
05459       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
05460       if (LastIsNonZero) {
05461         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
05462                               MVT::i16, Op.getOperand(i-1));
05463       }
05464       if (ThisIsNonZero) {
05465         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
05466         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
05467                               ThisElt, DAG.getConstant(8, MVT::i8));
05468         if (LastIsNonZero)
05469           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
05470       } else
05471         ThisElt = LastElt;
05472 
05473       if (ThisElt.getNode())
05474         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
05475                         DAG.getIntPtrConstant(i/2));
05476     }
05477   }
05478 
05479   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
05480 }
05481 
05482 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
05483 ///
05484 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
05485                                      unsigned NumNonZero, unsigned NumZero,
05486                                      SelectionDAG &DAG,
05487                                      const X86Subtarget* Subtarget,
05488                                      const TargetLowering &TLI) {
05489   if (NumNonZero > 4)
05490     return SDValue();
05491 
05492   SDLoc dl(Op);
05493   SDValue V;
05494   bool First = true;
05495   for (unsigned i = 0; i < 8; ++i) {
05496     bool isNonZero = (NonZeros & (1 << i)) != 0;
05497     if (isNonZero) {
05498       if (First) {
05499         if (NumZero)
05500           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05501         else
05502           V = DAG.getUNDEF(MVT::v8i16);
05503         First = false;
05504       }
05505       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
05506                       MVT::v8i16, V, Op.getOperand(i),
05507                       DAG.getIntPtrConstant(i));
05508     }
05509   }
05510 
05511   return V;
05512 }
05513 
05514 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
05515 static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
05516                                      unsigned NonZeros, unsigned NumNonZero,
05517                                      unsigned NumZero, SelectionDAG &DAG,
05518                                      const X86Subtarget *Subtarget,
05519                                      const TargetLowering &TLI) {
05520   // We know there's at least one non-zero element
05521   unsigned FirstNonZeroIdx = 0;
05522   SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
05523   while (FirstNonZero.getOpcode() == ISD::UNDEF ||
05524          X86::isZeroNode(FirstNonZero)) {
05525     ++FirstNonZeroIdx;
05526     FirstNonZero = Op->getOperand(FirstNonZeroIdx);
05527   }
05528 
05529   if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05530       !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
05531     return SDValue();
05532 
05533   SDValue V = FirstNonZero.getOperand(0);
05534   MVT VVT = V.getSimpleValueType();
05535   if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
05536     return SDValue();
05537 
05538   unsigned FirstNonZeroDst =
05539       cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
05540   unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
05541   unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
05542   unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
05543 
05544   for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
05545     SDValue Elem = Op.getOperand(Idx);
05546     if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
05547       continue;
05548 
05549     // TODO: What else can be here? Deal with it.
05550     if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
05551       return SDValue();
05552 
05553     // TODO: Some optimizations are still possible here
05554     // ex: Getting one element from a vector, and the rest from another.
05555     if (Elem.getOperand(0) != V)
05556       return SDValue();
05557 
05558     unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
05559     if (Dst == Idx)
05560       ++CorrectIdx;
05561     else if (IncorrectIdx == -1U) {
05562       IncorrectIdx = Idx;
05563       IncorrectDst = Dst;
05564     } else
05565       // There was already one element with an incorrect index.
05566       // We can't optimize this case to an insertps.
05567       return SDValue();
05568   }
05569 
05570   if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
05571     SDLoc dl(Op);
05572     EVT VT = Op.getSimpleValueType();
05573     unsigned ElementMoveMask = 0;
05574     if (IncorrectIdx == -1U)
05575       ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
05576     else
05577       ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
05578 
05579     SDValue InsertpsMask =
05580         DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
05581     return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
05582   }
05583 
05584   return SDValue();
05585 }
05586 
05587 /// getVShift - Return a vector logical shift node.
05588 ///
05589 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
05590                          unsigned NumBits, SelectionDAG &DAG,
05591                          const TargetLowering &TLI, SDLoc dl) {
05592   assert(VT.is128BitVector() && "Unknown type for VShift");
05593   EVT ShVT = MVT::v2i64;
05594   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
05595   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
05596   return DAG.getNode(ISD::BITCAST, dl, VT,
05597                      DAG.getNode(Opc, dl, ShVT, SrcOp,
05598                              DAG.getConstant(NumBits,
05599                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
05600 }
05601 
05602 static SDValue
05603 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
05604 
05605   // Check if the scalar load can be widened into a vector load. And if
05606   // the address is "base + cst" see if the cst can be "absorbed" into
05607   // the shuffle mask.
05608   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
05609     SDValue Ptr = LD->getBasePtr();
05610     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
05611       return SDValue();
05612     EVT PVT = LD->getValueType(0);
05613     if (PVT != MVT::i32 && PVT != MVT::f32)
05614       return SDValue();
05615 
05616     int FI = -1;
05617     int64_t Offset = 0;
05618     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
05619       FI = FINode->getIndex();
05620       Offset = 0;
05621     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
05622                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
05623       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
05624       Offset = Ptr.getConstantOperandVal(1);
05625       Ptr = Ptr.getOperand(0);
05626     } else {
05627       return SDValue();
05628     }
05629 
05630     // FIXME: 256-bit vector instructions don't require a strict alignment,
05631     // improve this code to support it better.
05632     unsigned RequiredAlign = VT.getSizeInBits()/8;
05633     SDValue Chain = LD->getChain();
05634     // Make sure the stack object alignment is at least 16 or 32.
05635     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
05636     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
05637       if (MFI->isFixedObjectIndex(FI)) {
05638         // Can't change the alignment. FIXME: It's possible to compute
05639         // the exact stack offset and reference FI + adjust offset instead.
05640         // If someone *really* cares about this. That's the way to implement it.
05641         return SDValue();
05642       } else {
05643         MFI->setObjectAlignment(FI, RequiredAlign);
05644       }
05645     }
05646 
05647     // (Offset % 16 or 32) must be multiple of 4. Then address is then
05648     // Ptr + (Offset & ~15).
05649     if (Offset < 0)
05650       return SDValue();
05651     if ((Offset % RequiredAlign) & 3)
05652       return SDValue();
05653     int64_t StartOffset = Offset & ~(RequiredAlign-1);
05654     if (StartOffset)
05655       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
05656                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
05657 
05658     int EltNo = (Offset - StartOffset) >> 2;
05659     unsigned NumElems = VT.getVectorNumElements();
05660 
05661     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
05662     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
05663                              LD->getPointerInfo().getWithOffset(StartOffset),
05664                              false, false, false, 0);
05665 
05666     SmallVector<int, 8> Mask;
05667     for (unsigned i = 0; i != NumElems; ++i)
05668       Mask.push_back(EltNo);
05669 
05670     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
05671   }
05672 
05673   return SDValue();
05674 }
05675 
05676 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
05677 /// vector of type 'VT', see if the elements can be replaced by a single large
05678 /// load which has the same value as a build_vector whose operands are 'elts'.
05679 ///
05680 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
05681 ///
05682 /// FIXME: we'd also like to handle the case where the last elements are zero
05683 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
05684 /// There's even a handy isZeroNode for that purpose.
05685 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
05686                                         SDLoc &DL, SelectionDAG &DAG,
05687                                         bool isAfterLegalize) {
05688   EVT EltVT = VT.getVectorElementType();
05689   unsigned NumElems = Elts.size();
05690 
05691   LoadSDNode *LDBase = nullptr;
05692   unsigned LastLoadedElt = -1U;
05693 
05694   // For each element in the initializer, see if we've found a load or an undef.
05695   // If we don't find an initial load element, or later load elements are
05696   // non-consecutive, bail out.
05697   for (unsigned i = 0; i < NumElems; ++i) {
05698     SDValue Elt = Elts[i];
05699 
05700     if (!Elt.getNode() ||
05701         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
05702       return SDValue();
05703     if (!LDBase) {
05704       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
05705         return SDValue();
05706       LDBase = cast<LoadSDNode>(Elt.getNode());
05707       LastLoadedElt = i;
05708       continue;
05709     }
05710     if (Elt.getOpcode() == ISD::UNDEF)
05711       continue;
05712 
05713     LoadSDNode *LD = cast<LoadSDNode>(Elt);
05714     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
05715       return SDValue();
05716     LastLoadedElt = i;
05717   }
05718 
05719   // If we have found an entire vector of loads and undefs, then return a large
05720   // load of the entire vector width starting at the base pointer.  If we found
05721   // consecutive loads for the low half, generate a vzext_load node.
05722   if (LastLoadedElt == NumElems - 1) {
05723 
05724     if (isAfterLegalize &&
05725         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
05726       return SDValue();
05727 
05728     SDValue NewLd = SDValue();
05729 
05730     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
05731       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05732                           LDBase->getPointerInfo(),
05733                           LDBase->isVolatile(), LDBase->isNonTemporal(),
05734                           LDBase->isInvariant(), 0);
05735     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05736                         LDBase->getPointerInfo(),
05737                         LDBase->isVolatile(), LDBase->isNonTemporal(),
05738                         LDBase->isInvariant(), LDBase->getAlignment());
05739 
05740     if (LDBase->hasAnyUseOfValue(1)) {
05741       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05742                                      SDValue(LDBase, 1),
05743                                      SDValue(NewLd.getNode(), 1));
05744       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05745       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05746                              SDValue(NewLd.getNode(), 1));
05747     }
05748 
05749     return NewLd;
05750   }
05751   if (NumElems == 4 && LastLoadedElt == 1 &&
05752       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
05753     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
05754     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
05755     SDValue ResNode =
05756         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
05757                                 LDBase->getPointerInfo(),
05758                                 LDBase->getAlignment(),
05759                                 false/*isVolatile*/, true/*ReadMem*/,
05760                                 false/*WriteMem*/);
05761 
05762     // Make sure the newly-created LOAD is in the same position as LDBase in
05763     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
05764     // update uses of LDBase's output chain to use the TokenFactor.
05765     if (LDBase->hasAnyUseOfValue(1)) {
05766       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05767                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
05768       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05769       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05770                              SDValue(ResNode.getNode(), 1));
05771     }
05772 
05773     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
05774   }
05775   return SDValue();
05776 }
05777 
05778 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
05779 /// to generate a splat value for the following cases:
05780 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
05781 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
05782 /// a scalar load, or a constant.
05783 /// The VBROADCAST node is returned when a pattern is found,
05784 /// or SDValue() otherwise.
05785 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
05786                                     SelectionDAG &DAG) {
05787   if (!Subtarget->hasFp256())
05788     return SDValue();
05789 
05790   MVT VT = Op.getSimpleValueType();
05791   SDLoc dl(Op);
05792 
05793   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
05794          "Unsupported vector type for broadcast.");
05795 
05796   SDValue Ld;
05797   bool ConstSplatVal;
05798 
05799   switch (Op.getOpcode()) {
05800     default:
05801       // Unknown pattern found.
05802       return SDValue();
05803 
05804     case ISD::BUILD_VECTOR: {
05805       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
05806       BitVector UndefElements;
05807       SDValue Splat = BVOp->getSplatValue(&UndefElements);
05808 
05809       // We need a splat of a single value to use broadcast, and it doesn't
05810       // make any sense if the value is only in one element of the vector.
05811       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
05812         return SDValue();
05813 
05814       Ld = Splat;
05815       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
05816                        Ld.getOpcode() == ISD::ConstantFP);
05817 
05818       // Make sure that all of the users of a non-constant load are from the
05819       // BUILD_VECTOR node.
05820       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
05821         return SDValue();
05822       break;
05823     }
05824 
05825     case ISD::VECTOR_SHUFFLE: {
05826       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
05827 
05828       // Shuffles must have a splat mask where the first element is
05829       // broadcasted.
05830       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
05831         return SDValue();
05832 
05833       SDValue Sc = Op.getOperand(0);
05834       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
05835           Sc.getOpcode() != ISD::BUILD_VECTOR) {
05836 
05837         if (!Subtarget->hasInt256())
05838           return SDValue();
05839 
05840         // Use the register form of the broadcast instruction available on AVX2.
05841         if (VT.getSizeInBits() >= 256)
05842           Sc = Extract128BitVector(Sc, 0, DAG, dl);
05843         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
05844       }
05845 
05846       Ld = Sc.getOperand(0);
05847       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
05848                        Ld.getOpcode() == ISD::ConstantFP);
05849 
05850       // The scalar_to_vector node and the suspected
05851       // load node must have exactly one user.
05852       // Constants may have multiple users.
05853 
05854       // AVX-512 has register version of the broadcast
05855       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
05856         Ld.getValueType().getSizeInBits() >= 32;
05857       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
05858           !hasRegVer))
05859         return SDValue();
05860       break;
05861     }
05862   }
05863 
05864   bool IsGE256 = (VT.getSizeInBits() >= 256);
05865 
05866   // Handle the broadcasting a single constant scalar from the constant pool
05867   // into a vector. On Sandybridge it is still better to load a constant vector
05868   // from the constant pool and not to broadcast it from a scalar.
05869   if (ConstSplatVal && Subtarget->hasInt256()) {
05870     EVT CVT = Ld.getValueType();
05871     assert(!CVT.isVector() && "Must not broadcast a vector type");
05872     unsigned ScalarSize = CVT.getSizeInBits();
05873 
05874     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
05875       const Constant *C = nullptr;
05876       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
05877         C = CI->getConstantIntValue();
05878       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
05879         C = CF->getConstantFPValue();
05880 
05881       assert(C && "Invalid constant type");
05882 
05883       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
05884       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
05885       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
05886       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
05887                        MachinePointerInfo::getConstantPool(),
05888                        false, false, false, Alignment);
05889 
05890       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05891     }
05892   }
05893 
05894   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
05895   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
05896 
05897   // Handle AVX2 in-register broadcasts.
05898   if (!IsLoad && Subtarget->hasInt256() &&
05899       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
05900     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05901 
05902   // The scalar source must be a normal load.
05903   if (!IsLoad)
05904     return SDValue();
05905 
05906   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
05907     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05908 
05909   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
05910   // double since there is no vbroadcastsd xmm
05911   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
05912     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
05913       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05914   }
05915 
05916   // Unsupported broadcast.
05917   return SDValue();
05918 }
05919 
05920 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
05921 /// underlying vector and index.
05922 ///
05923 /// Modifies \p ExtractedFromVec to the real vector and returns the real
05924 /// index.
05925 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
05926                                          SDValue ExtIdx) {
05927   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
05928   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
05929     return Idx;
05930 
05931   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
05932   // lowered this:
05933   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
05934   // to:
05935   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
05936   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
05937   //                           undef)
05938   //                       Constant<0>)
05939   // In this case the vector is the extract_subvector expression and the index
05940   // is 2, as specified by the shuffle.
05941   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
05942   SDValue ShuffleVec = SVOp->getOperand(0);
05943   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
05944   assert(ShuffleVecVT.getVectorElementType() ==
05945          ExtractedFromVec.getSimpleValueType().getVectorElementType());
05946 
05947   int ShuffleIdx = SVOp->getMaskElt(Idx);
05948   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
05949     ExtractedFromVec = ShuffleVec;
05950     return ShuffleIdx;
05951   }
05952   return Idx;
05953 }
05954 
05955 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
05956   MVT VT = Op.getSimpleValueType();
05957 
05958   // Skip if insert_vec_elt is not supported.
05959   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
05960   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
05961     return SDValue();
05962 
05963   SDLoc DL(Op);
05964   unsigned NumElems = Op.getNumOperands();
05965 
05966   SDValue VecIn1;
05967   SDValue VecIn2;
05968   SmallVector<unsigned, 4> InsertIndices;
05969   SmallVector<int, 8> Mask(NumElems, -1);
05970 
05971   for (unsigned i = 0; i != NumElems; ++i) {
05972     unsigned Opc = Op.getOperand(i).getOpcode();
05973 
05974     if (Opc == ISD::UNDEF)
05975       continue;
05976 
05977     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
05978       // Quit if more than 1 elements need inserting.
05979       if (InsertIndices.size() > 1)
05980         return SDValue();
05981 
05982       InsertIndices.push_back(i);
05983       continue;
05984     }
05985 
05986     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
05987     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
05988     // Quit if non-constant index.
05989     if (!isa<ConstantSDNode>(ExtIdx))
05990       return SDValue();
05991     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
05992 
05993     // Quit if extracted from vector of different type.
05994     if (ExtractedFromVec.getValueType() != VT)
05995       return SDValue();
05996 
05997     if (!VecIn1.getNode())
05998       VecIn1 = ExtractedFromVec;
05999     else if (VecIn1 != ExtractedFromVec) {
06000       if (!VecIn2.getNode())
06001         VecIn2 = ExtractedFromVec;
06002       else if (VecIn2 != ExtractedFromVec)
06003         // Quit if more than 2 vectors to shuffle
06004         return SDValue();
06005     }
06006 
06007     if (ExtractedFromVec == VecIn1)
06008       Mask[i] = Idx;
06009     else if (ExtractedFromVec == VecIn2)
06010       Mask[i] = Idx + NumElems;
06011   }
06012 
06013   if (!VecIn1.getNode())
06014     return SDValue();
06015 
06016   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
06017   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
06018   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
06019     unsigned Idx = InsertIndices[i];
06020     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
06021                      DAG.getIntPtrConstant(Idx));
06022   }
06023 
06024   return NV;
06025 }
06026 
06027 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
06028 SDValue
06029 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
06030 
06031   MVT VT = Op.getSimpleValueType();
06032   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
06033          "Unexpected type in LowerBUILD_VECTORvXi1!");
06034 
06035   SDLoc dl(Op);
06036   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06037     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
06038     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06039     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06040   }
06041 
06042   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
06043     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
06044     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06045     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06046   }
06047 
06048   bool AllContants = true;
06049   uint64_t Immediate = 0;
06050   int NonConstIdx = -1;
06051   bool IsSplat = true;
06052   unsigned NumNonConsts = 0;
06053   unsigned NumConsts = 0;
06054   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
06055     SDValue In = Op.getOperand(idx);
06056     if (In.getOpcode() == ISD::UNDEF)
06057       continue;
06058     if (!isa<ConstantSDNode>(In)) {
06059       AllContants = false;
06060       NonConstIdx = idx;
06061       NumNonConsts++;
06062     }
06063     else {
06064       NumConsts++;
06065       if (cast<ConstantSDNode>(In)->getZExtValue())
06066       Immediate |= (1ULL << idx);
06067     }
06068     if (In != Op.getOperand(0))
06069       IsSplat = false;
06070   }
06071 
06072   if (AllContants) {
06073     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
06074       DAG.getConstant(Immediate, MVT::i16));
06075     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
06076                        DAG.getIntPtrConstant(0));
06077   }
06078 
06079   if (NumNonConsts == 1 && NonConstIdx != 0) {
06080     SDValue DstVec;
06081     if (NumConsts) {
06082       SDValue VecAsImm = DAG.getConstant(Immediate,
06083                                          MVT::getIntegerVT(VT.getSizeInBits()));
06084       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
06085     }
06086     else 
06087       DstVec = DAG.getUNDEF(VT);
06088     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
06089                        Op.getOperand(NonConstIdx),
06090                        DAG.getIntPtrConstant(NonConstIdx));
06091   }
06092   if (!IsSplat && (NonConstIdx != 0))
06093     llvm_unreachable("Unsupported BUILD_VECTOR operation");
06094   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
06095   SDValue Select;
06096   if (IsSplat)
06097     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06098                           DAG.getConstant(-1, SelectVT),
06099                           DAG.getConstant(0, SelectVT));
06100   else
06101     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06102                          DAG.getConstant((Immediate | 1), SelectVT),
06103                          DAG.getConstant(Immediate, SelectVT));
06104   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
06105 }
06106 
06107 /// \brief Return true if \p N implements a horizontal binop and return the
06108 /// operands for the horizontal binop into V0 and V1.
06109 /// 
06110 /// This is a helper function of PerformBUILD_VECTORCombine.
06111 /// This function checks that the build_vector \p N in input implements a
06112 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
06113 /// operation to match.
06114 /// For example, if \p Opcode is equal to ISD::ADD, then this function
06115 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
06116 /// is equal to ISD::SUB, then this function checks if this is a horizontal
06117 /// arithmetic sub.
06118 ///
06119 /// This function only analyzes elements of \p N whose indices are
06120 /// in range [BaseIdx, LastIdx).
06121 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
06122                               SelectionDAG &DAG,
06123                               unsigned BaseIdx, unsigned LastIdx,
06124                               SDValue &V0, SDValue &V1) {
06125   EVT VT = N->getValueType(0);
06126 
06127   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
06128   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
06129          "Invalid Vector in input!");
06130   
06131   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
06132   bool CanFold = true;
06133   unsigned ExpectedVExtractIdx = BaseIdx;
06134   unsigned NumElts = LastIdx - BaseIdx;
06135   V0 = DAG.getUNDEF(VT);
06136   V1 = DAG.getUNDEF(VT);
06137 
06138   // Check if N implements a horizontal binop.
06139   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
06140     SDValue Op = N->getOperand(i + BaseIdx);
06141 
06142     // Skip UNDEFs.
06143     if (Op->getOpcode() == ISD::UNDEF) {
06144       // Update the expected vector extract index.
06145       if (i * 2 == NumElts)
06146         ExpectedVExtractIdx = BaseIdx;
06147       ExpectedVExtractIdx += 2;
06148       continue;
06149     }
06150 
06151     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
06152 
06153     if (!CanFold)
06154       break;
06155 
06156     SDValue Op0 = Op.getOperand(0);
06157     SDValue Op1 = Op.getOperand(1);
06158 
06159     // Try to match the following pattern:
06160     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
06161     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06162         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06163         Op0.getOperand(0) == Op1.getOperand(0) &&
06164         isa<ConstantSDNode>(Op0.getOperand(1)) &&
06165         isa<ConstantSDNode>(Op1.getOperand(1)));
06166     if (!CanFold)
06167       break;
06168 
06169     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06170     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
06171 
06172     if (i * 2 < NumElts) {
06173       if (V0.getOpcode() == ISD::UNDEF)
06174         V0 = Op0.getOperand(0);
06175     } else {
06176       if (V1.getOpcode() == ISD::UNDEF)
06177         V1 = Op0.getOperand(0);
06178       if (i * 2 == NumElts)
06179         ExpectedVExtractIdx = BaseIdx;
06180     }
06181 
06182     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
06183     if (I0 == ExpectedVExtractIdx)
06184       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
06185     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
06186       // Try to match the following dag sequence:
06187       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
06188       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
06189     } else
06190       CanFold = false;
06191 
06192     ExpectedVExtractIdx += 2;
06193   }
06194 
06195   return CanFold;
06196 }
06197 
06198 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
06199 /// a concat_vector. 
06200 ///
06201 /// This is a helper function of PerformBUILD_VECTORCombine.
06202 /// This function expects two 256-bit vectors called V0 and V1.
06203 /// At first, each vector is split into two separate 128-bit vectors.
06204 /// Then, the resulting 128-bit vectors are used to implement two
06205 /// horizontal binary operations. 
06206 ///
06207 /// The kind of horizontal binary operation is defined by \p X86Opcode.
06208 ///
06209 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
06210 /// the two new horizontal binop.
06211 /// When Mode is set, the first horizontal binop dag node would take as input
06212 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
06213 /// horizontal binop dag node would take as input the lower 128-bit of V1
06214 /// and the upper 128-bit of V1.
06215 ///   Example:
06216 ///     HADD V0_LO, V0_HI
06217 ///     HADD V1_LO, V1_HI
06218 ///
06219 /// Otherwise, the first horizontal binop dag node takes as input the lower
06220 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
06221 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
06222 ///   Example:
06223 ///     HADD V0_LO, V1_LO
06224 ///     HADD V0_HI, V1_HI
06225 ///
06226 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
06227 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
06228 /// the upper 128-bits of the result.
06229 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
06230                                      SDLoc DL, SelectionDAG &DAG,
06231                                      unsigned X86Opcode, bool Mode,
06232                                      bool isUndefLO, bool isUndefHI) {
06233   EVT VT = V0.getValueType();
06234   assert(VT.is256BitVector() && VT == V1.getValueType() &&
06235          "Invalid nodes in input!");
06236 
06237   unsigned NumElts = VT.getVectorNumElements();
06238   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
06239   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
06240   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
06241   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
06242   EVT NewVT = V0_LO.getValueType();
06243 
06244   SDValue LO = DAG.getUNDEF(NewVT);
06245   SDValue HI = DAG.getUNDEF(NewVT);
06246 
06247   if (Mode) {
06248     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06249     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
06250       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
06251     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
06252       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
06253   } else {
06254     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06255     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
06256                        V1_LO->getOpcode() != ISD::UNDEF))
06257       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
06258 
06259     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
06260                        V1_HI->getOpcode() != ISD::UNDEF))
06261       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
06262   }
06263 
06264   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
06265 }
06266 
06267 /// \brief Try to fold a build_vector that performs an 'addsub' into the
06268 /// sequence of 'vadd + vsub + blendi'.
06269 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
06270                            const X86Subtarget *Subtarget) {
06271   SDLoc DL(BV);
06272   EVT VT = BV->getValueType(0);
06273   unsigned NumElts = VT.getVectorNumElements();
06274   SDValue InVec0 = DAG.getUNDEF(VT);
06275   SDValue InVec1 = DAG.getUNDEF(VT);
06276 
06277   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
06278           VT == MVT::v2f64) && "build_vector with an invalid type found!");
06279 
06280   // Don't try to emit a VSELECT that cannot be lowered into a blend.
06281   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06282   if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
06283     return SDValue();
06284 
06285   // Odd-numbered elements in the input build vector are obtained from
06286   // adding two integer/float elements.
06287   // Even-numbered elements in the input build vector are obtained from
06288   // subtracting two integer/float elements.
06289   unsigned ExpectedOpcode = ISD::FSUB;
06290   unsigned NextExpectedOpcode = ISD::FADD;
06291   bool AddFound = false;
06292   bool SubFound = false;
06293 
06294   for (unsigned i = 0, e = NumElts; i != e; i++) {
06295     SDValue Op = BV->getOperand(i);
06296       
06297     // Skip 'undef' values.
06298     unsigned Opcode = Op.getOpcode();
06299     if (Opcode == ISD::UNDEF) {
06300       std::swap(ExpectedOpcode, NextExpectedOpcode);
06301       continue;
06302     }
06303       
06304     // Early exit if we found an unexpected opcode.
06305     if (Opcode != ExpectedOpcode)
06306       return SDValue();
06307 
06308     SDValue Op0 = Op.getOperand(0);
06309     SDValue Op1 = Op.getOperand(1);
06310 
06311     // Try to match the following pattern:
06312     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
06313     // Early exit if we cannot match that sequence.
06314     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06315         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06316         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
06317         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
06318         Op0.getOperand(1) != Op1.getOperand(1))
06319       return SDValue();
06320 
06321     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06322     if (I0 != i)
06323       return SDValue();
06324 
06325     // We found a valid add/sub node. Update the information accordingly.
06326     if (i & 1)
06327       AddFound = true;
06328     else
06329       SubFound = true;
06330 
06331     // Update InVec0 and InVec1.
06332     if (InVec0.getOpcode() == ISD::UNDEF)
06333       InVec0 = Op0.getOperand(0);
06334     if (InVec1.getOpcode() == ISD::UNDEF)
06335       InVec1 = Op1.getOperand(0);
06336 
06337     // Make sure that operands in input to each add/sub node always
06338     // come from a same pair of vectors.
06339     if (InVec0 != Op0.getOperand(0)) {
06340       if (ExpectedOpcode == ISD::FSUB)
06341         return SDValue();
06342 
06343       // FADD is commutable. Try to commute the operands
06344       // and then test again.
06345       std::swap(Op0, Op1);
06346       if (InVec0 != Op0.getOperand(0))
06347         return SDValue();
06348     }
06349 
06350     if (InVec1 != Op1.getOperand(0))
06351       return SDValue();
06352 
06353     // Update the pair of expected opcodes.
06354     std::swap(ExpectedOpcode, NextExpectedOpcode);
06355   }
06356 
06357   // Don't try to fold this build_vector into a VSELECT if it has
06358   // too many UNDEF operands.
06359   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
06360       InVec1.getOpcode() != ISD::UNDEF) {
06361     // Emit a sequence of vector add and sub followed by a VSELECT.
06362     // The new VSELECT will be lowered into a BLENDI.
06363     // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI'
06364     // and emit a single ADDSUB instruction.
06365     SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1);
06366     SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1);
06367 
06368     // Construct the VSELECT mask.
06369     EVT MaskVT = VT.changeVectorElementTypeToInteger();
06370     EVT SVT = MaskVT.getVectorElementType();
06371     unsigned SVTBits = SVT.getSizeInBits();
06372     SmallVector<SDValue, 8> Ops;
06373 
06374     for (unsigned i = 0, e = NumElts; i != e; ++i) {
06375       APInt Value = i & 1 ? APInt::getNullValue(SVTBits) :
06376                             APInt::getAllOnesValue(SVTBits);
06377       SDValue Constant = DAG.getConstant(Value, SVT);
06378       Ops.push_back(Constant);
06379     }
06380 
06381     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops);
06382     return DAG.getSelect(DL, VT, Mask, Sub, Add);
06383   }
06384   
06385   return SDValue();
06386 }
06387 
06388 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
06389                                           const X86Subtarget *Subtarget) {
06390   SDLoc DL(N);
06391   EVT VT = N->getValueType(0);
06392   unsigned NumElts = VT.getVectorNumElements();
06393   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
06394   SDValue InVec0, InVec1;
06395 
06396   // Try to match an ADDSUB.
06397   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
06398       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
06399     SDValue Value = matchAddSub(BV, DAG, Subtarget);
06400     if (Value.getNode())
06401       return Value;
06402   }
06403 
06404   // Try to match horizontal ADD/SUB.
06405   unsigned NumUndefsLO = 0;
06406   unsigned NumUndefsHI = 0;
06407   unsigned Half = NumElts/2;
06408 
06409   // Count the number of UNDEF operands in the build_vector in input.
06410   for (unsigned i = 0, e = Half; i != e; ++i)
06411     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06412       NumUndefsLO++;
06413 
06414   for (unsigned i = Half, e = NumElts; i != e; ++i)
06415     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06416       NumUndefsHI++;
06417 
06418   // Early exit if this is either a build_vector of all UNDEFs or all the
06419   // operands but one are UNDEF.
06420   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
06421     return SDValue();
06422 
06423   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
06424     // Try to match an SSE3 float HADD/HSUB.
06425     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06426       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06427     
06428     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06429       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06430   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
06431     // Try to match an SSSE3 integer HADD/HSUB.
06432     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06433       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
06434     
06435     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06436       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
06437   }
06438   
06439   if (!Subtarget->hasAVX())
06440     return SDValue();
06441 
06442   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
06443     // Try to match an AVX horizontal add/sub of packed single/double
06444     // precision floating point values from 256-bit vectors.
06445     SDValue InVec2, InVec3;
06446     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
06447         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
06448         ((InVec0.getOpcode() == ISD::UNDEF ||
06449           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06450         ((InVec1.getOpcode() == ISD::UNDEF ||
06451           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06452       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06453 
06454     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
06455         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
06456         ((InVec0.getOpcode() == ISD::UNDEF ||
06457           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06458         ((InVec1.getOpcode() == ISD::UNDEF ||
06459           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06460       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06461   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
06462     // Try to match an AVX2 horizontal add/sub of signed integers.
06463     SDValue InVec2, InVec3;
06464     unsigned X86Opcode;
06465     bool CanFold = true;
06466 
06467     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
06468         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
06469         ((InVec0.getOpcode() == ISD::UNDEF ||
06470           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06471         ((InVec1.getOpcode() == ISD::UNDEF ||
06472           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06473       X86Opcode = X86ISD::HADD;
06474     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
06475         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
06476         ((InVec0.getOpcode() == ISD::UNDEF ||
06477           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06478         ((InVec1.getOpcode() == ISD::UNDEF ||
06479           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06480       X86Opcode = X86ISD::HSUB;
06481     else
06482       CanFold = false;
06483 
06484     if (CanFold) {
06485       // Fold this build_vector into a single horizontal add/sub.
06486       // Do this only if the target has AVX2.
06487       if (Subtarget->hasAVX2())
06488         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
06489  
06490       // Do not try to expand this build_vector into a pair of horizontal
06491       // add/sub if we can emit a pair of scalar add/sub.
06492       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06493         return SDValue();
06494 
06495       // Convert this build_vector into a pair of horizontal binop followed by
06496       // a concat vector.
06497       bool isUndefLO = NumUndefsLO == Half;
06498       bool isUndefHI = NumUndefsHI == Half;
06499       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
06500                                    isUndefLO, isUndefHI);
06501     }
06502   }
06503 
06504   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
06505        VT == MVT::v16i16) && Subtarget->hasAVX()) {
06506     unsigned X86Opcode;
06507     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06508       X86Opcode = X86ISD::HADD;
06509     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06510       X86Opcode = X86ISD::HSUB;
06511     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06512       X86Opcode = X86ISD::FHADD;
06513     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06514       X86Opcode = X86ISD::FHSUB;
06515     else
06516       return SDValue();
06517 
06518     // Don't try to expand this build_vector into a pair of horizontal add/sub
06519     // if we can simply emit a pair of scalar add/sub.
06520     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06521       return SDValue();
06522 
06523     // Convert this build_vector into two horizontal add/sub followed by
06524     // a concat vector.
06525     bool isUndefLO = NumUndefsLO == Half;
06526     bool isUndefHI = NumUndefsHI == Half;
06527     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
06528                                  isUndefLO, isUndefHI);
06529   }
06530 
06531   return SDValue();
06532 }
06533 
06534 SDValue
06535 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
06536   SDLoc dl(Op);
06537 
06538   MVT VT = Op.getSimpleValueType();
06539   MVT ExtVT = VT.getVectorElementType();
06540   unsigned NumElems = Op.getNumOperands();
06541 
06542   // Generate vectors for predicate vectors.
06543   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
06544     return LowerBUILD_VECTORvXi1(Op, DAG);
06545 
06546   // Vectors containing all zeros can be matched by pxor and xorps later
06547   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06548     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
06549     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
06550     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
06551       return Op;
06552 
06553     return getZeroVector(VT, Subtarget, DAG, dl);
06554   }
06555 
06556   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
06557   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
06558   // vpcmpeqd on 256-bit vectors.
06559   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
06560     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
06561       return Op;
06562 
06563     if (!VT.is512BitVector())
06564       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
06565   }
06566 
06567   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
06568   if (Broadcast.getNode())
06569     return Broadcast;
06570 
06571   unsigned EVTBits = ExtVT.getSizeInBits();
06572 
06573   unsigned NumZero  = 0;
06574   unsigned NumNonZero = 0;
06575   unsigned NonZeros = 0;
06576   bool IsAllConstants = true;
06577   SmallSet<SDValue, 8> Values;
06578   for (unsigned i = 0; i < NumElems; ++i) {
06579     SDValue Elt = Op.getOperand(i);
06580     if (Elt.getOpcode() == ISD::UNDEF)
06581       continue;
06582     Values.insert(Elt);
06583     if (Elt.getOpcode() != ISD::Constant &&
06584         Elt.getOpcode() != ISD::ConstantFP)
06585       IsAllConstants = false;
06586     if (X86::isZeroNode(Elt))
06587       NumZero++;
06588     else {
06589       NonZeros |= (1 << i);
06590       NumNonZero++;
06591     }
06592   }
06593 
06594   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
06595   if (NumNonZero == 0)
06596     return DAG.getUNDEF(VT);
06597 
06598   // Special case for single non-zero, non-undef, element.
06599   if (NumNonZero == 1) {
06600     unsigned Idx = countTrailingZeros(NonZeros);
06601     SDValue Item = Op.getOperand(Idx);
06602 
06603     // If this is an insertion of an i64 value on x86-32, and if the top bits of
06604     // the value are obviously zero, truncate the value to i32 and do the
06605     // insertion that way.  Only do this if the value is non-constant or if the
06606     // value is a constant being inserted into element 0.  It is cheaper to do
06607     // a constant pool load than it is to do a movd + shuffle.
06608     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
06609         (!IsAllConstants || Idx == 0)) {
06610       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
06611         // Handle SSE only.
06612         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
06613         EVT VecVT = MVT::v4i32;
06614         unsigned VecElts = 4;
06615 
06616         // Truncate the value (which may itself be a constant) to i32, and
06617         // convert it to a vector with movd (S2V+shuffle to zero extend).
06618         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
06619         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
06620         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06621 
06622         // Now we have our 32-bit value zero extended in the low element of
06623         // a vector.  If Idx != 0, swizzle it into place.
06624         if (Idx != 0) {
06625           SmallVector<int, 4> Mask;
06626           Mask.push_back(Idx);
06627           for (unsigned i = 1; i != VecElts; ++i)
06628             Mask.push_back(i);
06629           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
06630                                       &Mask[0]);
06631         }
06632         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06633       }
06634     }
06635 
06636     // If we have a constant or non-constant insertion into the low element of
06637     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
06638     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
06639     // depending on what the source datatype is.
06640     if (Idx == 0) {
06641       if (NumZero == 0)
06642         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06643 
06644       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
06645           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
06646         if (VT.is256BitVector() || VT.is512BitVector()) {
06647           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
06648           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
06649                              Item, DAG.getIntPtrConstant(0));
06650         }
06651         assert(VT.is128BitVector() && "Expected an SSE value type!");
06652         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06653         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
06654         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06655       }
06656 
06657       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
06658         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
06659         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
06660         if (VT.is256BitVector()) {
06661           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
06662           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
06663         } else {
06664           assert(VT.is128BitVector() && "Expected an SSE value type!");
06665           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06666         }
06667         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06668       }
06669     }
06670 
06671     // Is it a vector logical left shift?
06672     if (NumElems == 2 && Idx == 1 &&
06673         X86::isZeroNode(Op.getOperand(0)) &&
06674         !X86::isZeroNode(Op.getOperand(1))) {
06675       unsigned NumBits = VT.getSizeInBits();
06676       return getVShift(true, VT,
06677                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
06678                                    VT, Op.getOperand(1)),
06679                        NumBits/2, DAG, *this, dl);
06680     }
06681 
06682     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
06683       return SDValue();
06684 
06685     // Otherwise, if this is a vector with i32 or f32 elements, and the element
06686     // is a non-constant being inserted into an element other than the low one,
06687     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
06688     // movd/movss) to move this into the low element, then shuffle it into
06689     // place.
06690     if (EVTBits == 32) {
06691       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06692 
06693       // Turn it into a shuffle of zero and zero-extended scalar to vector.
06694       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
06695       SmallVector<int, 8> MaskVec;
06696       for (unsigned i = 0; i != NumElems; ++i)
06697         MaskVec.push_back(i == Idx ? 0 : 1);
06698       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
06699     }
06700   }
06701 
06702   // Splat is obviously ok. Let legalizer expand it to a shuffle.
06703   if (Values.size() == 1) {
06704     if (EVTBits == 32) {
06705       // Instead of a shuffle like this:
06706       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
06707       // Check if it's possible to issue this instead.
06708       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
06709       unsigned Idx = countTrailingZeros(NonZeros);
06710       SDValue Item = Op.getOperand(Idx);
06711       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
06712         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
06713     }
06714     return SDValue();
06715   }
06716 
06717   // A vector full of immediates; various special cases are already
06718   // handled, so this is best done with a single constant-pool load.
06719   if (IsAllConstants)
06720     return SDValue();
06721 
06722   // For AVX-length vectors, build the individual 128-bit pieces and use
06723   // shuffles to put them in place.
06724   if (VT.is256BitVector() || VT.is512BitVector()) {
06725     SmallVector<SDValue, 64> V;
06726     for (unsigned i = 0; i != NumElems; ++i)
06727       V.push_back(Op.getOperand(i));
06728 
06729     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
06730 
06731     // Build both the lower and upper subvector.
06732     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
06733                                 makeArrayRef(&V[0], NumElems/2));
06734     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
06735                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
06736 
06737     // Recreate the wider vector with the lower and upper part.
06738     if (VT.is256BitVector())
06739       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06740     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06741   }
06742 
06743   // Let legalizer expand 2-wide build_vectors.
06744   if (EVTBits == 64) {
06745     if (NumNonZero == 1) {
06746       // One half is zero or undef.
06747       unsigned Idx = countTrailingZeros(NonZeros);
06748       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
06749                                  Op.getOperand(Idx));
06750       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
06751     }
06752     return SDValue();
06753   }
06754 
06755   // If element VT is < 32 bits, convert it to inserts into a zero vector.
06756   if (EVTBits == 8 && NumElems == 16) {
06757     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
06758                                         Subtarget, *this);
06759     if (V.getNode()) return V;
06760   }
06761 
06762   if (EVTBits == 16 && NumElems == 8) {
06763     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
06764                                       Subtarget, *this);
06765     if (V.getNode()) return V;
06766   }
06767 
06768   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
06769   if (EVTBits == 32 && NumElems == 4) {
06770     SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
06771                                       NumZero, DAG, Subtarget, *this);
06772     if (V.getNode())
06773       return V;
06774   }
06775 
06776   // If element VT is == 32 bits, turn it into a number of shuffles.
06777   SmallVector<SDValue, 8> V(NumElems);
06778   if (NumElems == 4 && NumZero > 0) {
06779     for (unsigned i = 0; i < 4; ++i) {
06780       bool isZero = !(NonZeros & (1 << i));
06781       if (isZero)
06782         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
06783       else
06784         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
06785     }
06786 
06787     for (unsigned i = 0; i < 2; ++i) {
06788       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
06789         default: break;
06790         case 0:
06791           V[i] = V[i*2];  // Must be a zero vector.
06792           break;
06793         case 1:
06794           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
06795           break;
06796         case 2:
06797           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
06798           break;
06799         case 3:
06800           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
06801           break;
06802       }
06803     }
06804 
06805     bool Reverse1 = (NonZeros & 0x3) == 2;
06806     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
06807     int MaskVec[] = {
06808       Reverse1 ? 1 : 0,
06809       Reverse1 ? 0 : 1,
06810       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
06811       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
06812     };
06813     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
06814   }
06815 
06816   if (Values.size() > 1 && VT.is128BitVector()) {
06817     // Check for a build vector of consecutive loads.
06818     for (unsigned i = 0; i < NumElems; ++i)
06819       V[i] = Op.getOperand(i);
06820 
06821     // Check for elements which are consecutive loads.
06822     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
06823     if (LD.getNode())
06824       return LD;
06825 
06826     // Check for a build vector from mostly shuffle plus few inserting.
06827     SDValue Sh = buildFromShuffleMostly(Op, DAG);
06828     if (Sh.getNode())
06829       return Sh;
06830 
06831     // For SSE 4.1, use insertps to put the high elements into the low element.
06832     if (getSubtarget()->hasSSE41()) {
06833       SDValue Result;
06834       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
06835         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
06836       else
06837         Result = DAG.getUNDEF(VT);
06838 
06839       for (unsigned i = 1; i < NumElems; ++i) {
06840         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
06841         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
06842                              Op.getOperand(i), DAG.getIntPtrConstant(i));
06843       }
06844       return Result;
06845     }
06846 
06847     // Otherwise, expand into a number of unpckl*, start by extending each of
06848     // our (non-undef) elements to the full vector width with the element in the
06849     // bottom slot of the vector (which generates no code for SSE).
06850     for (unsigned i = 0; i < NumElems; ++i) {
06851       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
06852         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
06853       else
06854         V[i] = DAG.getUNDEF(VT);
06855     }
06856 
06857     // Next, we iteratively mix elements, e.g. for v4f32:
06858     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
06859     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
06860     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
06861     unsigned EltStride = NumElems >> 1;
06862     while (EltStride != 0) {
06863       for (unsigned i = 0; i < EltStride; ++i) {
06864         // If V[i+EltStride] is undef and this is the first round of mixing,
06865         // then it is safe to just drop this shuffle: V[i] is already in the
06866         // right place, the one element (since it's the first round) being
06867         // inserted as undef can be dropped.  This isn't safe for successive
06868         // rounds because they will permute elements within both vectors.
06869         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
06870             EltStride == NumElems/2)
06871           continue;
06872 
06873         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
06874       }
06875       EltStride >>= 1;
06876     }
06877     return V[0];
06878   }
06879   return SDValue();
06880 }
06881 
06882 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
06883 // to create 256-bit vectors from two other 128-bit ones.
06884 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
06885   SDLoc dl(Op);
06886   MVT ResVT = Op.getSimpleValueType();
06887 
06888   assert((ResVT.is256BitVector() ||
06889           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
06890 
06891   SDValue V1 = Op.getOperand(0);
06892   SDValue V2 = Op.getOperand(1);
06893   unsigned NumElems = ResVT.getVectorNumElements();
06894   if(ResVT.is256BitVector())
06895     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
06896 
06897   if (Op.getNumOperands() == 4) {
06898     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
06899                                 ResVT.getVectorNumElements()/2);
06900     SDValue V3 = Op.getOperand(2);
06901     SDValue V4 = Op.getOperand(3);
06902     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
06903       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
06904   }
06905   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
06906 }
06907 
06908 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
06909   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
06910   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
06911          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
06912           Op.getNumOperands() == 4)));
06913 
06914   // AVX can use the vinsertf128 instruction to create 256-bit vectors
06915   // from two other 128-bit ones.
06916 
06917   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
06918   return LowerAVXCONCAT_VECTORS(Op, DAG);
06919 }
06920 
06921 
06922 //===----------------------------------------------------------------------===//
06923 // Vector shuffle lowering
06924 //
06925 // This is an experimental code path for lowering vector shuffles on x86. It is
06926 // designed to handle arbitrary vector shuffles and blends, gracefully
06927 // degrading performance as necessary. It works hard to recognize idiomatic
06928 // shuffles and lower them to optimal instruction patterns without leaving
06929 // a framework that allows reasonably efficient handling of all vector shuffle
06930 // patterns.
06931 //===----------------------------------------------------------------------===//
06932 
06933 /// \brief Tiny helper function to identify a no-op mask.
06934 ///
06935 /// This is a somewhat boring predicate function. It checks whether the mask
06936 /// array input, which is assumed to be a single-input shuffle mask of the kind
06937 /// used by the X86 shuffle instructions (not a fully general
06938 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
06939 /// in-place shuffle are 'no-op's.
06940 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
06941   for (int i = 0, Size = Mask.size(); i < Size; ++i)
06942     if (Mask[i] != -1 && Mask[i] != i)
06943       return false;
06944   return true;
06945 }
06946 
06947 /// \brief Helper function to classify a mask as a single-input mask.
06948 ///
06949 /// This isn't a generic single-input test because in the vector shuffle
06950 /// lowering we canonicalize single inputs to be the first input operand. This
06951 /// means we can more quickly test for a single input by only checking whether
06952 /// an input from the second operand exists. We also assume that the size of
06953 /// mask corresponds to the size of the input vectors which isn't true in the
06954 /// fully general case.
06955 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
06956   for (int M : Mask)
06957     if (M >= (int)Mask.size())
06958       return false;
06959   return true;
06960 }
06961 
06962 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
06963 ///
06964 /// This helper function produces an 8-bit shuffle immediate corresponding to
06965 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
06966 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
06967 /// example.
06968 ///
06969 /// NB: We rely heavily on "undef" masks preserving the input lane.
06970 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
06971                                           SelectionDAG &DAG) {
06972   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
06973   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
06974   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
06975   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
06976   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
06977 
06978   unsigned Imm = 0;
06979   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
06980   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
06981   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
06982   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
06983   return DAG.getConstant(Imm, MVT::i8);
06984 }
06985 
06986 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
06987 ///
06988 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
06989 /// support for floating point shuffles but not integer shuffles. These
06990 /// instructions will incur a domain crossing penalty on some chips though so
06991 /// it is better to avoid lowering through this for integer vectors where
06992 /// possible.
06993 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
06994                                        const X86Subtarget *Subtarget,
06995                                        SelectionDAG &DAG) {
06996   SDLoc DL(Op);
06997   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
06998   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
06999   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
07000   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
07001   ArrayRef<int> Mask = SVOp->getMask();
07002   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
07003 
07004   if (isSingleInputShuffleMask(Mask)) {
07005     // Straight shuffle of a single input vector. Simulate this by using the
07006     // single input as both of the "inputs" to this instruction..
07007     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
07008     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
07009                        DAG.getConstant(SHUFPDMask, MVT::i8));
07010   }
07011   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
07012   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
07013 
07014   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
07015   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
07016                      DAG.getConstant(SHUFPDMask, MVT::i8));
07017 }
07018 
07019 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
07020 ///
07021 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
07022 /// the integer unit to minimize domain crossing penalties. However, for blends
07023 /// it falls back to the floating point shuffle operation with appropriate bit
07024 /// casting.
07025 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
07026                                        const X86Subtarget *Subtarget,
07027                                        SelectionDAG &DAG) {
07028   SDLoc DL(Op);
07029   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
07030   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
07031   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
07032   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
07033   ArrayRef<int> Mask = SVOp->getMask();
07034   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
07035 
07036   if (isSingleInputShuffleMask(Mask)) {
07037     // Straight shuffle of a single input vector. For everything from SSE2
07038     // onward this has a single fast instruction with no scary immediates.
07039     // We have to map the mask as it is actually a v4i32 shuffle instruction.
07040     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
07041     int WidenedMask[4] = {
07042         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
07043         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
07044     return DAG.getNode(
07045         ISD::BITCAST, DL, MVT::v2i64,
07046         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
07047                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
07048   }
07049 
07050   // We implement this with SHUFPD which is pretty lame because it will likely
07051   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
07052   // However, all the alternatives are still more cycles and newer chips don't
07053   // have this problem. It would be really nice if x86 had better shuffles here.
07054   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
07055   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
07056   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
07057                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
07058 }
07059 
07060 /// \brief Lower 4-lane 32-bit floating point shuffles.
07061 ///
07062 /// Uses instructions exclusively from the floating point unit to minimize
07063 /// domain crossing penalties, as these are sufficient to implement all v4f32
07064 /// shuffles.
07065 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
07066                                        const X86Subtarget *Subtarget,
07067                                        SelectionDAG &DAG) {
07068   SDLoc DL(Op);
07069   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
07070   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
07071   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
07072   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
07073   ArrayRef<int> Mask = SVOp->getMask();
07074   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
07075 
07076   SDValue LowV = V1, HighV = V2;
07077   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
07078 
07079   int NumV2Elements =
07080       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
07081 
07082   if (NumV2Elements == 0)
07083     // Straight shuffle of a single input vector. We pass the input vector to
07084     // both operands to simulate this with a SHUFPS.
07085     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
07086                        getV4X86ShuffleImm8ForMask(Mask, DAG));
07087 
07088   if (NumV2Elements == 1) {
07089     int V2Index =
07090         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
07091         Mask.begin();
07092     // Compute the index adjacent to V2Index and in the same half by toggling
07093     // the low bit.
07094     int V2AdjIndex = V2Index ^ 1;
07095 
07096     if (Mask[V2AdjIndex] == -1) {
07097       // Handles all the cases where we have a single V2 element and an undef.
07098       // This will only ever happen in the high lanes because we commute the
07099       // vector otherwise.
07100       if (V2Index < 2)
07101         std::swap(LowV, HighV);
07102       NewMask[V2Index] -= 4;
07103     } else {
07104       // Handle the case where the V2 element ends up adjacent to a V1 element.
07105       // To make this work, blend them together as the first step.
07106       int V1Index = V2AdjIndex;
07107       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
07108       V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
07109                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
07110 
07111       // Now proceed to reconstruct the final blend as we have the necessary
07112       // high or low half formed.
07113       if (V2Index < 2) {
07114         LowV = V2;
07115         HighV = V1;
07116       } else {
07117         HighV = V2;
07118       }
07119       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
07120       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
07121     }
07122   } else if (NumV2Elements == 2) {
07123     if (Mask[0] < 4 && Mask[1] < 4) {
07124       // Handle the easy case where we have V1 in the low lanes and V2 in the
07125       // high lanes. We never see this reversed because we sort the shuffle.
07126       NewMask[2] -= 4;
07127       NewMask[3] -= 4;
07128     } else {
07129       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
07130       // trying to place elements directly, just blend them and set up the final
07131       // shuffle to place them.
07132 
07133       // The first two blend mask elements are for V1, the second two are for
07134       // V2.
07135       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
07136                           Mask[2] < 4 ? Mask[2] : Mask[3],
07137                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
07138                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
07139       V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
07140                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
07141 
07142       // Now we do a normal shuffle of V1 by giving V1 as both operands to
07143       // a blend.
07144       LowV = HighV = V1;
07145       NewMask[0] = Mask[0] < 4 ? 0 : 2;
07146       NewMask[1] = Mask[0] < 4 ? 2 : 0;
07147       NewMask[2] = Mask[2] < 4 ? 1 : 3;
07148       NewMask[3] = Mask[2] < 4 ? 3 : 1;
07149     }
07150   }
07151   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
07152                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
07153 }
07154 
07155 /// \brief Lower 4-lane i32 vector shuffles.
07156 ///
07157 /// We try to handle these with integer-domain shuffles where we can, but for
07158 /// blends we use the floating point domain blend instructions.
07159 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
07160                                        const X86Subtarget *Subtarget,
07161                                        SelectionDAG &DAG) {
07162   SDLoc DL(Op);
07163   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
07164   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
07165   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
07166   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
07167   ArrayRef<int> Mask = SVOp->getMask();
07168   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
07169 
07170   if (isSingleInputShuffleMask(Mask))
07171     // Straight shuffle of a single input vector. For everything from SSE2
07172     // onward this has a single fast instruction with no scary immediates.
07173     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
07174                        getV4X86ShuffleImm8ForMask(Mask, DAG));
07175 
07176   // We implement this with SHUFPS because it can blend from two vectors.
07177   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
07178