
X86ISelLowering.cpp
00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86InstrBuilder.h"
00019 #include "X86MachineFunctionInfo.h"
00020 #include "X86TargetMachine.h"
00021 #include "X86TargetObjectFile.h"
00022 #include "llvm/ADT/SmallSet.h"
00023 #include "llvm/ADT/Statistic.h"
00024 #include "llvm/ADT/StringExtras.h"
00025 #include "llvm/ADT/StringSwitch.h"
00026 #include "llvm/ADT/VariadicFunction.h"
00027 #include "llvm/CodeGen/IntrinsicLowering.h"
00028 #include "llvm/CodeGen/MachineFrameInfo.h"
00029 #include "llvm/CodeGen/MachineFunction.h"
00030 #include "llvm/CodeGen/MachineInstrBuilder.h"
00031 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00032 #include "llvm/CodeGen/MachineModuleInfo.h"
00033 #include "llvm/CodeGen/MachineRegisterInfo.h"
00034 #include "llvm/IR/CallSite.h"
00035 #include "llvm/IR/CallingConv.h"
00036 #include "llvm/IR/Constants.h"
00037 #include "llvm/IR/DerivedTypes.h"
00038 #include "llvm/IR/Function.h"
00039 #include "llvm/IR/GlobalAlias.h"
00040 #include "llvm/IR/GlobalVariable.h"
00041 #include "llvm/IR/Instructions.h"
00042 #include "llvm/IR/Intrinsics.h"
00043 #include "llvm/MC/MCAsmInfo.h"
00044 #include "llvm/MC/MCContext.h"
00045 #include "llvm/MC/MCExpr.h"
00046 #include "llvm/MC/MCSymbol.h"
00047 #include "llvm/Support/CommandLine.h"
00048 #include "llvm/Support/Debug.h"
00049 #include "llvm/Support/ErrorHandling.h"
00050 #include "llvm/Support/MathExtras.h"
00051 #include "llvm/Target/TargetOptions.h"
00052 #include <bitset>
00053 #include <numeric>
00054 #include <cctype>
00055 using namespace llvm;
00056 
00057 #define DEBUG_TYPE "x86-isel"
00058 
00059 STATISTIC(NumTailCalls, "Number of tail calls");
00060 
00061 static cl::opt<bool> ExperimentalVectorWideningLegalization(
00062     "x86-experimental-vector-widening-legalization", cl::init(false),
00063     cl::desc("Enable an experimental vector type legalization through widening "
00064              "rather than promotion."),
00065     cl::Hidden);
00066 
00067 static cl::opt<bool> ExperimentalVectorShuffleLowering(
00068     "x86-experimental-vector-shuffle-lowering", cl::init(false),
00069     cl::desc("Enable an experimental vector shuffle lowering code path."),
00070     cl::Hidden);
00071 
00072 // Forward declarations.
00073 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00074                        SDValue V2);
00075 
00076 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
00077                                 SelectionDAG &DAG, SDLoc dl,
00078                                 unsigned vectorWidth) {
00079   assert((vectorWidth == 128 || vectorWidth == 256) &&
00080          "Unsupported vector width");
00081   EVT VT = Vec.getValueType();
00082   EVT ElVT = VT.getVectorElementType();
00083   unsigned Factor = VT.getSizeInBits()/vectorWidth;
00084   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
00085                                   VT.getVectorNumElements()/Factor);
00086 
00087   // Extract from UNDEF is UNDEF.
00088   if (Vec.getOpcode() == ISD::UNDEF)
00089     return DAG.getUNDEF(ResultVT);
00090 
00091   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
00092   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
00093 
00094   // This is the index of the first element of the vectorWidth-bit chunk
00095   // we want.
00096   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
00097                                * ElemsPerChunk);
00098 
00099   // If the input is a buildvector just emit a smaller one.
00100   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
00101     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
00102                        makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
00103                                     ElemsPerChunk));
00104 
00105   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00106   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
00107                                VecIdx);
00108 
00109   return Result;
00110 
00111 }
00112 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
00113 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
00114 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
00115 /// instructions or a simple subregister reference. Idx is an index in the
00116 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
00117 /// lowering EXTRACT_VECTOR_ELT operations easier.
00118 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
00119                                    SelectionDAG &DAG, SDLoc dl) {
00120   assert((Vec.getValueType().is256BitVector() ||
00121           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
00122   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
00123 }
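// For illustration, a worked example of the index normalization done in
// ExtractSubVector above (a sketch, assuming a v8i32 source and IdxVal == 5):
//   ElemsPerChunk    = 128 / 32 = 4
//   NormalizedIdxVal = ((5 * 32) / 128) * 4 = 4
// so the EXTRACT_SUBVECTOR starts at element 4, i.e. the upper 128-bit half
// that contains element 5. This is why IdxVal need not be 128-bit aligned.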
00124 
00125 /// Generate a DAG to grab 256-bits from a 512-bit vector.
00126 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
00127                                    SelectionDAG &DAG, SDLoc dl) {
00128   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
00129   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
00130 }
00131 
00132 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
00133                                unsigned IdxVal, SelectionDAG &DAG,
00134                                SDLoc dl, unsigned vectorWidth) {
00135   assert((vectorWidth == 128 || vectorWidth == 256) &&
00136          "Unsupported vector width");
00137   // Inserting an UNDEF vector into Result is a no-op; just return Result.
00138   if (Vec.getOpcode() == ISD::UNDEF)
00139     return Result;
00140   EVT VT = Vec.getValueType();
00141   EVT ElVT = VT.getVectorElementType();
00142   EVT ResultVT = Result.getValueType();
00143 
00144   // Insert the relevant vectorWidth bits.
00145   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
00146 
00147   // This is the index of the first element of the vectorWidth-bit chunk
00148   // we want.
00149   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
00150                                * ElemsPerChunk);
00151 
00152   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00153   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
00154                      VecIdx);
00155 }
00156 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
00157 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
00158 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
00159 /// simple superregister reference.  Idx is an index in the 128 bits
00160 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
00161 /// lowering INSERT_VECTOR_ELT operations easier.
00162 static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
00163                                   unsigned IdxVal, SelectionDAG &DAG,
00164                                   SDLoc dl) {
00165   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
00166   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
00167 }
00168 
00169 static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
00170                                   unsigned IdxVal, SelectionDAG &DAG,
00171                                   SDLoc dl) {
00172   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
00173   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
00174 }
00175 
00176 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
00177 /// instructions. This is used because creating CONCAT_VECTOR nodes of
00178 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
00179 /// large BUILD_VECTORS.
00180 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
00181                                    unsigned NumElems, SelectionDAG &DAG,
00182                                    SDLoc dl) {
00183   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00184   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
00185 }
00186 
00187 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
00188                                    unsigned NumElems, SelectionDAG &DAG,
00189                                    SDLoc dl) {
00190   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00191   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
00192 }
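// Illustrative DAG produced by Concat128BitVectors(V1, V2, MVT::v8i32, 8, ...)
// (a sketch of the nodes built above, not actual dumped output):
//   t0: v8i32 = INSERT_SUBVECTOR undef:v8i32, V1:v4i32, Constant:0
//   t1: v8i32 = INSERT_SUBVECTOR t0,          V2:v4i32, Constant:4
// which instruction selection then matches to VINSERTF128 as described in the
// comments above.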
00193 
00194 static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
00195   if (TT.isOSBinFormatMachO()) {
00196     if (TT.getArch() == Triple::x86_64)
00197       return new X86_64MachoTargetObjectFile();
00198     return new TargetLoweringObjectFileMachO();
00199   }
00200 
00201   if (TT.isOSLinux())
00202     return new X86LinuxTargetObjectFile();
00203   if (TT.isOSBinFormatELF())
00204     return new TargetLoweringObjectFileELF();
00205   if (TT.isKnownWindowsMSVCEnvironment())
00206     return new X86WindowsTargetObjectFile();
00207   if (TT.isOSBinFormatCOFF())
00208     return new TargetLoweringObjectFileCOFF();
00209   llvm_unreachable("unknown subtarget type");
00210 }
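// For example (illustrative triples, following the checks above in order):
// "x86_64-apple-darwin" selects X86_64MachoTargetObjectFile,
// "i686-pc-linux-gnu" selects X86LinuxTargetObjectFile, and
// "x86_64-pc-windows-msvc" selects X86WindowsTargetObjectFile.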
00211 
00212 // FIXME: This should stop caching the target machine as soon as
00213 // we can remove resetOperationActions et al.
00214 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
00215   : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
00216   Subtarget = &TM.getSubtarget<X86Subtarget>();
00217   X86ScalarSSEf64 = Subtarget->hasSSE2();
00218   X86ScalarSSEf32 = Subtarget->hasSSE1();
00219   TD = getDataLayout();
00220 
00221   resetOperationActions();
00222 }
00223 
00224 void X86TargetLowering::resetOperationActions() {
00225   const TargetMachine &TM = getTargetMachine();
00226   static bool FirstTimeThrough = true;
00227 
00228   // If none of the target options have changed, then we don't need to reset the
00229   // operation actions.
00230   if (!FirstTimeThrough && TO == TM.Options) return;
00231 
00232   if (!FirstTimeThrough) {
00233     // Reinitialize the actions.
00234     initActions();
00235     FirstTimeThrough = false;
00236   }
00237 
00238   TO = TM.Options;
00239 
00240   // Set up the TargetLowering object.
00241   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00242 
00243   // X86 is weird; it always uses i8 for shift amounts and setcc results.
00244   setBooleanContents(ZeroOrOneBooleanContent);
00245   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00246   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
00247 
00248   // For 64-bit, since we have so many registers, use the ILP scheduler; for
00249   // 32-bit code, use the register-pressure-specific scheduling.
00250   // For Atom, always use ILP scheduling.
00251   if (Subtarget->isAtom())
00252     setSchedulingPreference(Sched::ILP);
00253   else if (Subtarget->is64Bit())
00254     setSchedulingPreference(Sched::ILP);
00255   else
00256     setSchedulingPreference(Sched::RegPressure);
00257   const X86RegisterInfo *RegInfo =
00258     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
00259   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00260 
00261   // Bypass expensive divides on Atom when compiling with O2
00262   if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
00263     addBypassSlowDiv(32, 8);
00264     if (Subtarget->is64Bit())
00265       addBypassSlowDiv(64, 16);
00266   }
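  // Rough sketch of what addBypassSlowDiv(32, 8) buys us (illustrative; the
  // actual rewrite is done by the BypassSlowDivision utility): a 32-bit divide
  // is guarded by a cheap runtime check, and when both operands fit in 8 bits
  // the short divide is used instead, e.g.
  //   if (((a | b) & ~0xffu) == 0)                 // both operands fit in 8 bits
  //     q = (uint32_t)((uint8_t)a / (uint8_t)b);   // fast 8-bit divide
  //   else
  //     q = a / b;                                 // full 32-bit divide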
00267 
00268   if (Subtarget->isTargetKnownWindowsMSVC()) {
00269     // Setup Windows compiler runtime calls.
00270     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00271     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00272     setLibcallName(RTLIB::SREM_I64, "_allrem");
00273     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00274     setLibcallName(RTLIB::MUL_I64, "_allmul");
00275     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00276     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00277     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00278     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00279     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00280 
00281     // The _ftol2 runtime function has an unusual calling conv, which
00282     // is modeled by a special pseudo-instruction.
00283     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
00284     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
00285     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
00286     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
00287   }
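  // Illustrative consequence of the setup above (a sketch): on a 32-bit MSVC
  // target, "%q = sdiv i64 %a, %b" is lowered to a call to "_alldiv" using the
  // X86_StdCall convention, instead of the default "__divdi3" libcall.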
00288 
00289   if (Subtarget->isTargetDarwin()) {
00290     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00291     setUseUnderscoreSetJmp(false);
00292     setUseUnderscoreLongJmp(false);
00293   } else if (Subtarget->isTargetWindowsGNU()) {
00294     // MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore)!
00295     setUseUnderscoreSetJmp(true);
00296     setUseUnderscoreLongJmp(false);
00297   } else {
00298     setUseUnderscoreSetJmp(true);
00299     setUseUnderscoreLongJmp(true);
00300   }
00301 
00302   // Set up the register classes.
00303   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00304   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00305   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00306   if (Subtarget->is64Bit())
00307     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00308 
00309   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
00310 
00311   // We don't accept any truncstore of integer registers.
00312   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00313   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00314   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00315   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00316   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00317   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00318 
00319   // SETOEQ and SETUNE require checking two conditions.
00320   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00321   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00322   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00323   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00324   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00325   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
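  // Why two conditions (illustrative): UCOMISS/UCOMISD set ZF=PF=1 for an
  // unordered (NaN) compare, so "ordered equal" cannot be tested with a single
  // flag and roughly expands to
  //   jne  not_equal        // ZF == 0 -> not equal
  //   jp   not_equal        // PF == 1 -> unordered operand
  // with SETUNE being the complementary pair; marking the condition codes
  // Expand lets the legalizer emit that two-check sequence.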
00326 
00327   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00328   // operation.
00329   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00330   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00331   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00332 
00333   if (Subtarget->is64Bit()) {
00334     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00335     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00336   } else if (!TM.Options.UseSoftFloat) {
00337     // We have an algorithm for SSE2->double, and we turn this into a
00338     // 64-bit FILD followed by conditional FADD for other targets.
00339     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00340     // We have an algorithm for SSE2, and we turn this into a 64-bit
00341     // FILD for other targets.
00342     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00343   }
00344 
00345   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00346   // this operation.
00347   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00348   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00349 
00350   if (!TM.Options.UseSoftFloat) {
00351     // SSE has no i16 to fp conversion, only i32
00352     if (X86ScalarSSEf32) {
00353       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00354       // f32 and f64 cases are Legal, f80 case is not
00355       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00356     } else {
00357       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00358       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00359     }
00360   } else {
00361     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00362     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00363   }
00364 
00365   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00366   // are Legal, f80 is custom lowered.
00367   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00368   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00369 
00370   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
00371   // this operation.
00372   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00373   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00374 
00375   if (X86ScalarSSEf32) {
00376     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00377     // f32 and f64 cases are Legal, f80 case is not
00378     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00379   } else {
00380     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00381     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00382   }
00383 
00384   // Handle FP_TO_UINT by promoting the destination to a larger signed
00385   // conversion.
00386   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00387   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00388   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
00389 
00390   if (Subtarget->is64Bit()) {
00391     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00392     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00393   } else if (!TM.Options.UseSoftFloat) {
00394     // Since AVX is a superset of SSE3, only check for SSE here.
00395     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00396       // Expand FP_TO_UINT into a select.
00397       // FIXME: We would like to use a Custom expander here eventually to do
00398       // the optimal thing for SSE vs. the default expansion in the legalizer.
00399       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00400     else
00401       // With SSE3 we can use fisttpll to convert to a signed i64; without
00402       // SSE, we're stuck with a fistpll.
00403       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00404   }
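  // Sketch of the "expand into a select" path mentioned above (illustrative):
  // without SSE3, fp-to-u32 is legalized roughly as
  //   t = 2^31 (as an FP constant)
  //   r = x < t ? fp_to_sint(x) : fp_to_sint(x - t) ^ 0x80000000
  // which keeps the conversion inside the signed range the FPU can handle.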
00405 
00406   if (isTargetFTOL()) {
00407     // Use the _ftol2 runtime function, which has a pseudo-instruction
00408     // to handle its weird calling convention.
00409     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00410   }
00411 
00412   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00413   if (!X86ScalarSSEf64) {
00414     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00415     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00416     if (Subtarget->is64Bit()) {
00417       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00418       // Without SSE, i64->f64 goes through memory.
00419       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00420     }
00421   }
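  // Illustrative effect of marking these BITCASTs Expand (a sketch): without
  // SSE2, "%f = bitcast i64 %x to double" is legalized through a stack slot
  // (store the i64, reload it as f64), whereas the MOVD/MOVQ route mentioned
  // in the TODO above would keep the value in registers.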
00422 
00423   // Scalar integer divide and remainder are lowered to use operations that
00424   // produce two results, to match the available instructions. This exposes
00425   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00426   // into a single instruction.
00427   //
00428   // Scalar integer multiply-high is also lowered to use two-result
00429   // operations, to match the available instructions. However, plain multiply
00430   // (low) operations are left as Legal, as there are single-result
00431   // instructions for this in x86. Using the two-result multiply instructions
00432   // when both high and low results are needed must be arranged by dagcombine.
00433   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00434     MVT VT = IntVTs[i];
00435     setOperationAction(ISD::MULHS, VT, Expand);
00436     setOperationAction(ISD::MULHU, VT, Expand);
00437     setOperationAction(ISD::SDIV, VT, Expand);
00438     setOperationAction(ISD::UDIV, VT, Expand);
00439     setOperationAction(ISD::SREM, VT, Expand);
00440     setOperationAction(ISD::UREM, VT, Expand);
00441 
00442     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
00443     setOperationAction(ISD::ADDC, VT, Custom);
00444     setOperationAction(ISD::ADDE, VT, Custom);
00445     setOperationAction(ISD::SUBC, VT, Custom);
00446     setOperationAction(ISD::SUBE, VT, Custom);
00447   }
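  // Illustrative payoff of the two-result lowering described above (sketch):
  //   %q = sdiv i32 %a, %b
  //   %r = srem i32 %a, %b
  // both legalize to ISD::SDIVREM, CSE merges the two nodes, and a single IDIV
  // is emitted with the quotient in EAX and the remainder in EDX.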
00448 
00449   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00450   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00451   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00452   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00453   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00454   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00455   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00456   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00457   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00458   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
00459   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
00460   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
00461   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
00462   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
00463   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
00464   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
00465   if (Subtarget->is64Bit())
00466     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00467   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00468   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00469   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00470   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00471   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00472   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00473   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00474   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00475 
00476   // Promote the i8 variants and force them up to i32, which has a shorter
00477   // encoding.
00478   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00479   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00480   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00481   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
00482   if (Subtarget->hasBMI()) {
00483     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00484     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00485     if (Subtarget->is64Bit())
00486       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00487   } else {
00488     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00489     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00490     if (Subtarget->is64Bit())
00491       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00492   }
00493 
00494   if (Subtarget->hasLZCNT()) {
00495     // When promoting the i8 variants, force them to i32 for a shorter
00496     // encoding.
00497     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00498     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00499     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00500     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00501     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00502     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00503     if (Subtarget->is64Bit())
00504       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00505   } else {
00506     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00507     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00508     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00509     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00510     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00511     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00512     if (Subtarget->is64Bit()) {
00513       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00514       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00515     }
00516   }
00517 
00518   // Special handling for half-precision floating point conversions.
00519   // If we don't have F16C support, then lower half float conversions
00520   // into library calls.
00521   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
00522     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
00523     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
00524   }
00525 
00526   // There's never any support for operations beyond MVT::f32.
00527   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
00528   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
00529   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
00530   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
00531 
00532   setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
00533   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00534   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00535   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
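  // Illustrative consequence (sketch): without F16C the f16 <-> f32
  // conversions above become runtime library calls, while with F16C they can
  // be selected to the VCVTPH2PS/VCVTPS2PH instructions.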
00536 
00537   if (Subtarget->hasPOPCNT()) {
00538     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00539   } else {
00540     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00541     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00542     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00543     if (Subtarget->is64Bit())
00544       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00545   }
00546 
00547   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00548 
00549   if (!Subtarget->hasMOVBE())
00550     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00551 
00552   // These should be promoted to a larger select which is supported.
00553   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00554   // X86 wants to expand cmov itself.
00555   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00556   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00557   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00558   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00559   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00560   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00561   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00562   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00563   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00564   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00565   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00566   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00567   if (Subtarget->is64Bit()) {
00568     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00569     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00570   }
00571   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00572   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
00573   // SjLj exception handling but is a light-weight setjmp/longjmp replacement
00574   // used for continuations, user-level threading, etc. As a result, no other
00575   // SjLj exception interfaces are implemented; please don't build your own
00576   // exception handling based on them.
00577   // LLVM/Clang supports zero-cost DWARF exception handling.
00578   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00579   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00580 
00581   // Darwin ABI issue.
00582   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00583   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00584   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00585   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00586   if (Subtarget->is64Bit())
00587     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00588   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00589   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00590   if (Subtarget->is64Bit()) {
00591     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00592     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00593     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00594     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00595     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00596   }
00597   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00598   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00599   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00600   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00601   if (Subtarget->is64Bit()) {
00602     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00603     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00604     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00605   }
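  // Rough sketch of the 32-bit expansion (illustrative): "shl i64 %x, %n"
  // becomes an ISD::SHL_PARTS node over the (lo, hi) register pair, which the
  // custom lowering turns into approximately
  //   shld hi, lo, cl      // shift bits from lo into hi
  //   shl  lo, cl
  // plus a fix-up (test/cmov) for shift amounts of 32 or more.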
00606 
00607   if (Subtarget->hasSSE1())
00608     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00609 
00610   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00611 
00612   // Expand certain atomics
00613   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00614     MVT VT = IntVTs[i];
00615     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
00616     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00617     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00618   }
00619 
00620   if (Subtarget->hasCmpxchg16b()) {
00621     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
00622   }
00623 
00624   // FIXME - use subtarget debug flags
00625   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
00626       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
00627     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00628   }
00629 
00630   if (Subtarget->is64Bit()) {
00631     setExceptionPointerRegister(X86::RAX);
00632     setExceptionSelectorRegister(X86::RDX);
00633   } else {
00634     setExceptionPointerRegister(X86::EAX);
00635     setExceptionSelectorRegister(X86::EDX);
00636   }
00637   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00638   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00639 
00640   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00641   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00642 
00643   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00644   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00645 
00646   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00647   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00648   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00649   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00650     // TargetInfo::X86_64ABIBuiltinVaList
00651     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00652     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00653   } else {
00654     // TargetInfo::CharPtrBuiltinVaList
00655     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00656     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00657   }
00658 
00659   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00660   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00661 
00662   setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
00663                      MVT::i64 : MVT::i32, Custom);
00664 
00665   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00666     // f32 and f64 use SSE.
00667     // Set up the FP register classes.
00668     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00669     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00670 
00671     // Use ANDPD to simulate FABS.
00672     setOperationAction(ISD::FABS , MVT::f64, Custom);
00673     setOperationAction(ISD::FABS , MVT::f32, Custom);
00674 
00675     // Use XORP to simulate FNEG.
00676     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00677     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00678 
00679     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00680     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00681     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00682 
00683     // Lower this to FGETSIGNx86 plus an AND.
00684     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00685     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00686 
00687     // We don't support sin/cos/fmod
00688     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00689     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00690     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00691     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00692     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00693     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00694 
00695     // Expand FP immediates into loads from the stack, except for the special
00696     // cases we handle.
00697     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00698     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00699   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00700     // Use SSE for f32, x87 for f64.
00701     // Set up the FP register classes.
00702     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00703     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00704 
00705     // Use ANDPS to simulate FABS.
00706     setOperationAction(ISD::FABS , MVT::f32, Custom);
00707 
00708     // Use XORP to simulate FNEG.
00709     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00710 
00711     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00712 
00713     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00714     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00715     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00716 
00717     // We don't support sin/cos/fmod
00718     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00719     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00720     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00721 
00722     // Special cases we handle for FP constants.
00723     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00724     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00725     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00726     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00727     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00728 
00729     if (!TM.Options.UnsafeFPMath) {
00730       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00731       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00732       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00733     }
00734   } else if (!TM.Options.UseSoftFloat) {
00735     // f32 and f64 in x87.
00736     // Set up the FP register classes.
00737     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00738     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00739 
00740     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00741     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00742     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00743     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00744 
00745     if (!TM.Options.UnsafeFPMath) {
00746       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00747       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00748       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00749       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00750       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00751       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00752     }
00753     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00754     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00755     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00756     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00757     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00758     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00759     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00760     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00761   }
00762 
00763   // We don't support FMA.
00764   setOperationAction(ISD::FMA, MVT::f64, Expand);
00765   setOperationAction(ISD::FMA, MVT::f32, Expand);
00766 
00767   // Long double always uses X87.
00768   if (!TM.Options.UseSoftFloat) {
00769     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00770     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00771     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00772     {
00773       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00774       addLegalFPImmediate(TmpFlt);  // FLD0
00775       TmpFlt.changeSign();
00776       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00777 
00778       bool ignored;
00779       APFloat TmpFlt2(+1.0);
00780       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00781                       &ignored);
00782       addLegalFPImmediate(TmpFlt2);  // FLD1
00783       TmpFlt2.changeSign();
00784       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00785     }
00786 
00787     if (!TM.Options.UnsafeFPMath) {
00788       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00789       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00790       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00791     }
00792 
00793     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00794     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00795     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00796     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00797     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00798     setOperationAction(ISD::FMA, MVT::f80, Expand);
00799   }
00800 
00801   // Always use a library call for pow.
00802   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00803   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00804   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00805 
00806   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00807   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00808   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00809   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00810   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00811 
00812   // First set operation action for all vector types to either promote
00813   // (for widening) or expand (for scalarization). Then we will selectively
00814   // turn on ones that can be effectively codegen'd.
00815   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
00816            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
00817     MVT VT = (MVT::SimpleValueType)i;
00818     setOperationAction(ISD::ADD , VT, Expand);
00819     setOperationAction(ISD::SUB , VT, Expand);
00820     setOperationAction(ISD::FADD, VT, Expand);
00821     setOperationAction(ISD::FNEG, VT, Expand);
00822     setOperationAction(ISD::FSUB, VT, Expand);
00823     setOperationAction(ISD::MUL , VT, Expand);
00824     setOperationAction(ISD::FMUL, VT, Expand);
00825     setOperationAction(ISD::SDIV, VT, Expand);
00826     setOperationAction(ISD::UDIV, VT, Expand);
00827     setOperationAction(ISD::FDIV, VT, Expand);
00828     setOperationAction(ISD::SREM, VT, Expand);
00829     setOperationAction(ISD::UREM, VT, Expand);
00830     setOperationAction(ISD::LOAD, VT, Expand);
00831     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00832     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00833     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00834     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00835     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00836     setOperationAction(ISD::FABS, VT, Expand);
00837     setOperationAction(ISD::FSIN, VT, Expand);
00838     setOperationAction(ISD::FSINCOS, VT, Expand);
00839     setOperationAction(ISD::FCOS, VT, Expand);
00840     setOperationAction(ISD::FSINCOS, VT, Expand);
00841     setOperationAction(ISD::FREM, VT, Expand);
00842     setOperationAction(ISD::FMA,  VT, Expand);
00843     setOperationAction(ISD::FPOWI, VT, Expand);
00844     setOperationAction(ISD::FSQRT, VT, Expand);
00845     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00846     setOperationAction(ISD::FFLOOR, VT, Expand);
00847     setOperationAction(ISD::FCEIL, VT, Expand);
00848     setOperationAction(ISD::FTRUNC, VT, Expand);
00849     setOperationAction(ISD::FRINT, VT, Expand);
00850     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00851     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00852     setOperationAction(ISD::MULHS, VT, Expand);
00853     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00854     setOperationAction(ISD::MULHU, VT, Expand);
00855     setOperationAction(ISD::SDIVREM, VT, Expand);
00856     setOperationAction(ISD::UDIVREM, VT, Expand);
00857     setOperationAction(ISD::FPOW, VT, Expand);
00858     setOperationAction(ISD::CTPOP, VT, Expand);
00859     setOperationAction(ISD::CTTZ, VT, Expand);
00860     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00861     setOperationAction(ISD::CTLZ, VT, Expand);
00862     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00863     setOperationAction(ISD::SHL, VT, Expand);
00864     setOperationAction(ISD::SRA, VT, Expand);
00865     setOperationAction(ISD::SRL, VT, Expand);
00866     setOperationAction(ISD::ROTL, VT, Expand);
00867     setOperationAction(ISD::ROTR, VT, Expand);
00868     setOperationAction(ISD::BSWAP, VT, Expand);
00869     setOperationAction(ISD::SETCC, VT, Expand);
00870     setOperationAction(ISD::FLOG, VT, Expand);
00871     setOperationAction(ISD::FLOG2, VT, Expand);
00872     setOperationAction(ISD::FLOG10, VT, Expand);
00873     setOperationAction(ISD::FEXP, VT, Expand);
00874     setOperationAction(ISD::FEXP2, VT, Expand);
00875     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00876     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00877     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00878     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00879     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00880     setOperationAction(ISD::TRUNCATE, VT, Expand);
00881     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00882     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00883     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00884     setOperationAction(ISD::VSELECT, VT, Expand);
00885     setOperationAction(ISD::SELECT_CC, VT, Expand);
00886     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
00887              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
00888       setTruncStoreAction(VT,
00889                           (MVT::SimpleValueType)InnerVT, Expand);
00890     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
00891     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
00892 
00893     // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types;
00894     // we have to deal with them whether we ask for Expansion or not. Setting
00895     // Expand causes its own optimisation problems though, so leave them legal.
00896     if (VT.getVectorElementType() == MVT::i1)
00897       setLoadExtAction(ISD::EXTLOAD, VT, Expand);
00898   }
00899 
00900   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00901   // with -msoft-float, disable use of MMX as well.
00902   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00903     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00904     // No operations on x86mmx are supported; everything uses intrinsics.
00905   }
00906 
00907   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00908   // into smaller operations.
00909   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
00910   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
00911   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
00912   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
00913   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
00914   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
00915   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
00916   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
00917   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
00918   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
00919   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
00920   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
00921   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
00922   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
00923   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
00924   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
00925   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
00926   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
00927   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
00928   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
00929   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00930   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
00931   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
00932   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
00933   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
00934   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
00935   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
00936   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
00937   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
00938 
00939   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00940     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00941 
00942     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00943     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00944     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00945     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00946     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00947     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00948     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00949     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00950     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00951     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00952     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00953     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00954   }
00955 
00956   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00957     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00958 
00959     // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
00960     // registers cannot be used even for integer operations.
00961     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00962     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00963     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00964     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00965 
00966     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00967     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00968     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00969     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00970     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00971     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00972     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
00973     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
00974     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
00975     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
00976     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00977     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00978     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00979     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00980     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00981     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00982     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00983     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00984     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00985     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00986     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00987     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00988 
00989     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00990     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00991     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00992     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00993 
00994     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00995     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00996     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00997     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00998     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00999 
01000     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
01001     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01002       MVT VT = (MVT::SimpleValueType)i;
01003       // Do not attempt to custom lower non-power-of-2 vectors
01004       if (!isPowerOf2_32(VT.getVectorNumElements()))
01005         continue;
01006       // Do not attempt to custom lower non-128-bit vectors
01007       if (!VT.is128BitVector())
01008         continue;
01009       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01010       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01011       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01012     }
01013 
01014     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
01015     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
01016     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
01017     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
01018     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
01019     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
01020 
01021     if (Subtarget->is64Bit()) {
01022       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01023       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01024     }
01025 
01026     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
01027     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01028       MVT VT = (MVT::SimpleValueType)i;
01029 
01030       // Do not attempt to promote non-128-bit vectors
01031       if (!VT.is128BitVector())
01032         continue;
01033 
01034       setOperationAction(ISD::AND,    VT, Promote);
01035       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
01036       setOperationAction(ISD::OR,     VT, Promote);
01037       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
01038       setOperationAction(ISD::XOR,    VT, Promote);
01039       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
01040       setOperationAction(ISD::LOAD,   VT, Promote);
01041       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
01042       setOperationAction(ISD::SELECT, VT, Promote);
01043       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
01044     }
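    // Illustrative effect of the promotion loop above (sketch): a v4i32 AND is
    // rewritten as bitcast-to-v2i64, AND on v2i64, bitcast back to v4i32, so a
    // single PAND pattern covers every 128-bit integer element type.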
01045 
01046     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
01047 
01048     // Custom lower v2i64 and v2f64 selects.
01049     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
01050     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
01051     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
01052     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
01053 
01054     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
01055     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
01056 
01057     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
01058     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
01059     // As there is no 64-bit GPR available, we need to build a special custom
01060     // sequence to convert from v2i32 to v2f32.
01061     if (!Subtarget->is64Bit())
01062       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
01063 
01064     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
01065     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
01066 
01067     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
01068 
01069     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
01070     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
01071     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
01072   }
01073 
01074   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
01075     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
01076     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
01077     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
01078     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
01079     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
01080     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
01081     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
01082     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
01083     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
01084     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
01085 
01086     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
01087     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
01088     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
01089     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
01090     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
01091     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
01092     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
01093     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
01094     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
01095     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
01096 
01097     // FIXME: Do we need to handle scalar-to-vector here?
01098     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
01099 
01100     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
01101     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
01102     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
01103     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
01104     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
01105     // There is no BLENDI for byte vectors, so we don't custom lower these
01106     // vselects for now.
01107     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
01108 
01109     // i8 and i16 vectors are custom, because the source register and source
01110     // memory operand types are not the same width.  f32 vectors are
01111     // custom since the immediate controlling the insert encodes additional
01112     // information.
01113     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
01114     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01115     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01116     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01117 
01118     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
01119     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
01120     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
01121     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
01122 
01123     // FIXME: these should be Legal, but that's only for the case where
01124     // the index is constant.  For now, custom expand to deal with that.
01125     if (Subtarget->is64Bit()) {
01126       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01127       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01128     }
01129   }
01130 
01131   if (Subtarget->hasSSE2()) {
01132     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
01133     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
01134 
01135     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01136     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01137 
01138     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01139     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01140 
01141     // In the customized shift lowering, the legal cases in AVX2 will be
01142     // recognized.
01143     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01144     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01145 
01146     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01147     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01148 
01149     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01150   }
01151 
01152   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01153     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01154     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01155     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01156     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01157     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01158     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01159 
01160     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01161     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01162     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01163 
01164     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01165     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01166     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01167     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01168     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01169     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01170     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01171     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01172     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01173     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01174     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01175     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01176 
01177     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01178     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01179     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01180     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01181     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01182     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01183     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01184     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01185     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01186     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01187     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01188     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01189 
01190     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01191     // even though v8i16 is a legal type.
01192     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01193     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01194     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01195 
01196     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01197     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01198     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01199 
01200     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01201     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01202 
01203     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
01204 
01205     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01206     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01207 
01208     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01209     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01210 
01211     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01212     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01213 
01214     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01215     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01216     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01217     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01218 
01219     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01220     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01221     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01222 
01223     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
01224     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
01225     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
01226     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
01227 
01228     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01229     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01230     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01231     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01232     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01233     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01234     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01235     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01236     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01237     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01238     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01239     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01240 
01241     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01242       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01243       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01244       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01245       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01246       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01247       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01248     }
01249 
01250     if (Subtarget->hasInt256()) {
01251       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01252       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01253       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01254       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01255 
01256       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01257       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01258       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01259       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01260 
01261       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01262       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01263       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01264       // Don't lower v32i8 because there is no 128-bit byte mul
01265 
01266       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
01267       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
01268       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
01269       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
01270 
01271       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
01272       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01273     } else {
01274       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01275       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01276       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01277       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01278 
01279       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01280       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01281       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01282       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01283 
01284       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01285       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01286       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01287       // Don't lower v32i8 because there is no 128-bit byte mul
01288     }
01289 
01290     // In the customized shift lowering, the legal cases in AVX2 will be
01291     // recognized.
01292     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01293     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01294 
01295     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01296     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01297 
01298     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01299 
01300     // Custom lower several nodes for 256-bit types.
01301     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01302              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01303       MVT VT = (MVT::SimpleValueType)i;
01304 
01305       // Extract subvector is special because the value type
01306       // (result) is 128-bit but the source is 256-bit wide.
01307       if (VT.is128BitVector())
01308         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01309 
01310       // Do not attempt to custom lower other non-256-bit vectors
01311       if (!VT.is256BitVector())
01312         continue;
01313 
01314       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01315       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01316       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01317       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01318       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01319       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01320       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01321     }
01322 
01323     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01324     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01325       MVT VT = (MVT::SimpleValueType)i;
01326 
01327       // Do not attempt to promote non-256-bit vectors
01328       if (!VT.is256BitVector())
01329         continue;
01330 
01331       setOperationAction(ISD::AND,    VT, Promote);
01332       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01333       setOperationAction(ISD::OR,     VT, Promote);
01334       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01335       setOperationAction(ISD::XOR,    VT, Promote);
01336       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01337       setOperationAction(ISD::LOAD,   VT, Promote);
01338       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01339       setOperationAction(ISD::SELECT, VT, Promote);
01340       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01341     }
01342   }
01343 
01344   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01345     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01346     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01347     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01348     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01349 
01350     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01351     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01352     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01353 
01354     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01355     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01356     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01357     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01358     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01359     setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
01360     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01361     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01362     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01363     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01364     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01365 
01366     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01367     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01368     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01369     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01370     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01371     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01372 
01373     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01374     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01375     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01376     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01377     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01378     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01379     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01380     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01381 
01382     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01383     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01384     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01385     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01386     if (Subtarget->is64Bit()) {
01387       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01388       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01389       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01390       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01391     }
01392     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01393     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01394     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01395     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01396     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01397     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01398     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01399     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01400     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01401     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01402 
01403     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01404     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01405     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01406     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01407     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01408     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01409     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01410     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01411     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01412     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01413     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01414     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01415     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01416 
01417     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01418     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01419     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01420     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01421     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
01422     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01423 
01424     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01425     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01426 
01427     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01428 
01429     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01430     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01431     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01432     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01433     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01434     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01435     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01436     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01437     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01438 
01439     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01440     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01441 
01442     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01443     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01444 
01445     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01446 
01447     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01448     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01449 
01450     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01451     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01452 
01453     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01454     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01455 
01456     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01457     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01458     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01459     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01460     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01461     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01462 
01463     if (Subtarget->hasCDI()) {
01464       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
01465       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
01466     }
01467 
01468     // Custom lower several nodes.
01469     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01470              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01471       MVT VT = (MVT::SimpleValueType)i;
01472 
01473       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01474       // Extract subvector is special because the value type
01475       // (result) is 256/128-bit but the source is 512-bit wide.
01476       if (VT.is128BitVector() || VT.is256BitVector())
01477         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01478 
01479       if (VT.getVectorElementType() == MVT::i1)
01480         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01481 
01482       // Do not attempt to custom lower other non-512-bit vectors
01483       if (!VT.is512BitVector())
01484         continue;
01485 
01486       if (EltSize >= 32) {
01487         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01488         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01489         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01490         setOperationAction(ISD::VSELECT,             VT, Legal);
01491         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01492         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01493         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01494       }
01495     }
01496     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01497       MVT VT = (MVT::SimpleValueType)i;
01498 
01499       // Do not attempt to promote non-512-bit vectors
01500       if (!VT.is512BitVector())
01501         continue;
01502 
01503       setOperationAction(ISD::SELECT, VT, Promote);
01504       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01505     }
01506   } // has AVX-512
01507 
01508   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
01509     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
01510     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
01511   }
01512 
01513   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
01514   // of this type with custom code.
01515   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
01516            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
01517     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
01518                        Custom);
01519   }
01520 
01521   // We want to custom lower some of our intrinsics.
01522   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01523   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01524   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01525   if (!Subtarget->is64Bit())
01526     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01527 
01528   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01529   // handle type legalization for these operations here.
01530   //
01531   // FIXME: We really should do custom legalization for addition and
01532   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01533   // than generic legalization for 64-bit multiplication-with-overflow, though.
01534   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01535     // Add/Sub/Mul with overflow operations are custom lowered.
01536     MVT VT = IntVTs[i];
01537     setOperationAction(ISD::SADDO, VT, Custom);
01538     setOperationAction(ISD::UADDO, VT, Custom);
01539     setOperationAction(ISD::SSUBO, VT, Custom);
01540     setOperationAction(ISD::USUBO, VT, Custom);
01541     setOperationAction(ISD::SMULO, VT, Custom);
01542     setOperationAction(ISD::UMULO, VT, Custom);
01543   }
01544 
01545   // There are no 8-bit 3-address imul/mul instructions
01546   setOperationAction(ISD::SMULO, MVT::i8, Expand);
01547   setOperationAction(ISD::UMULO, MVT::i8, Expand);
01548 
01549   if (!Subtarget->is64Bit()) {
01550     // These libcalls are not available in 32-bit mode.
01551     setLibcallName(RTLIB::SHL_I128, nullptr);
01552     setLibcallName(RTLIB::SRL_I128, nullptr);
01553     setLibcallName(RTLIB::SRA_I128, nullptr);
01554   }
01555 
01556   // Combine sin / cos into one node or libcall if possible.
01557   if (Subtarget->hasSinCos()) {
01558     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01559     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01560     if (Subtarget->isTargetDarwin()) {
01561       // For MacOSX, we don't want the normal expansion of a libcall to
01562       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
01563       // traffic.
01564       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01565       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01566     }
01567   }
01568 
01569   if (Subtarget->isTargetWin64()) {
01570     setOperationAction(ISD::SDIV, MVT::i128, Custom);
01571     setOperationAction(ISD::UDIV, MVT::i128, Custom);
01572     setOperationAction(ISD::SREM, MVT::i128, Custom);
01573     setOperationAction(ISD::UREM, MVT::i128, Custom);
01574     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
01575     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
01576   }
01577 
01578   // We have target-specific dag combine patterns for the following nodes:
01579   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01580   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01581   setTargetDAGCombine(ISD::VSELECT);
01582   setTargetDAGCombine(ISD::SELECT);
01583   setTargetDAGCombine(ISD::SHL);
01584   setTargetDAGCombine(ISD::SRA);
01585   setTargetDAGCombine(ISD::SRL);
01586   setTargetDAGCombine(ISD::OR);
01587   setTargetDAGCombine(ISD::AND);
01588   setTargetDAGCombine(ISD::ADD);
01589   setTargetDAGCombine(ISD::FADD);
01590   setTargetDAGCombine(ISD::FSUB);
01591   setTargetDAGCombine(ISD::FMA);
01592   setTargetDAGCombine(ISD::SUB);
01593   setTargetDAGCombine(ISD::LOAD);
01594   setTargetDAGCombine(ISD::STORE);
01595   setTargetDAGCombine(ISD::ZERO_EXTEND);
01596   setTargetDAGCombine(ISD::ANY_EXTEND);
01597   setTargetDAGCombine(ISD::SIGN_EXTEND);
01598   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01599   setTargetDAGCombine(ISD::TRUNCATE);
01600   setTargetDAGCombine(ISD::SINT_TO_FP);
01601   setTargetDAGCombine(ISD::SETCC);
01602   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
01603   setTargetDAGCombine(ISD::BUILD_VECTOR);
01604   if (Subtarget->is64Bit())
01605     setTargetDAGCombine(ISD::MUL);
01606   setTargetDAGCombine(ISD::XOR);
01607 
01608   computeRegisterProperties();
01609 
01610   // On Darwin, -Os means optimize for size without hurting performance, so
01611   // do not reduce the limit.
01612   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01613   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01614   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01615   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01616   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01617   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01618   setPrefLoopAlignment(4); // 2^4 bytes.
01619 
01620   // A predictable cmov doesn't hurt on Atom because it's in-order.
01621   PredictableSelectIsExpensive = !Subtarget->isAtom();
01622 
01623   setPrefFunctionAlignment(4); // 2^4 bytes.
01624 }
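
// --- Illustrative sketch (editorial addition, not part of X86ISelLowering.cpp).
// The constructor above fills the per-(opcode, type) action tables that the
// legalizer consults later.  A minimal sketch of those queries is shown below;
// `TLI` is assumed to be a fully constructed X86TargetLowering for an
// SSE2-capable subtarget, and `sketchQueryActions` is a hypothetical helper.
#include "llvm/Target/TargetLowering.h"

static void sketchQueryActions(const llvm::TargetLowering &TLI) {
  using namespace llvm;
  // Logical ops on v16i8 were registered above as Promote with v2i64 as the
  // promoted type, so both queries should agree with that registration.
  if (TLI.getOperationAction(ISD::AND, MVT::v16i8) == TargetLowering::Promote)
    (void)TLI.getTypeToPromoteTo(ISD::AND, MVT::v16i8); // expect MVT::v2i64
  // SELECT on v2i64 was registered as Custom, so it is handled by
  // X86TargetLowering rather than by generic expansion.
  bool SelectIsCustom =
      TLI.getOperationAction(ISD::SELECT, MVT::v2i64) == TargetLowering::Custom;
  (void)SelectIsCustom;
}
// ---------------------------------------------------------------------------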
01625 
01626 TargetLoweringBase::LegalizeTypeAction
01627 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
01628   if (ExperimentalVectorWideningLegalization &&
01629       VT.getVectorNumElements() != 1 &&
01630       VT.getVectorElementType().getSimpleVT() != MVT::i1)
01631     return TypeWidenVector;
01632 
01633   return TargetLoweringBase::getPreferredVectorAction(VT);
01634 }
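
// --- Illustrative sketch (editorial addition, not part of X86ISelLowering.cpp).
// With the experimental widening flag enabled, an illegal vector such as
// v2i32 should be widened (toward v4i32) rather than taking the default
// integer-promotion path.  `TLI` is assumed to be an X86TargetLowering that
// was built with the flag set; `sketchPrefersWidening` is a hypothetical
// helper used only for illustration.
#include "llvm/Target/TargetLowering.h"

static bool sketchPrefersWidening(const llvm::TargetLoweringBase &TLI) {
  using namespace llvm;
  return TLI.getPreferredVectorAction(MVT::v2i32) ==
         TargetLoweringBase::TypeWidenVector;
}
// ---------------------------------------------------------------------------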
01635 
01636 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01637   if (!VT.isVector())
01638     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
01639 
01640   if (Subtarget->hasAVX512())
01641     switch (VT.getVectorNumElements()) {
01642     case  8: return MVT::v8i1;
01643     case 16: return MVT::v16i1;
01644     }
01645 
01646   return VT.changeVectorElementTypeToInteger();
01647 }
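
// --- Illustrative sketch (editorial addition, not part of X86ISelLowering.cpp).
// A few expected results of getSetCCResultType, assuming `TLI` is an
// X86TargetLowering and `Ctx` is the module's LLVMContext: scalar compares
// produce i1 (AVX-512) or i8, 8- and 16-element vectors map to mask types
// under AVX-512, and other vectors simply get an integer element type of the
// same width.  `sketchSetCCResults` is a hypothetical helper.
#include "llvm/Target/TargetLowering.h"

static void sketchSetCCResults(const llvm::TargetLowering &TLI,
                               llvm::LLVMContext &Ctx) {
  using namespace llvm;
  EVT Scalar = TLI.getSetCCResultType(Ctx, MVT::f64);    // i1 or i8
  EVT Masked = TLI.getSetCCResultType(Ctx, MVT::v16f32); // v16i1 with AVX-512
  EVT Plain  = TLI.getSetCCResultType(Ctx, MVT::v4f32);  // v4i32
  (void)Scalar; (void)Masked; (void)Plain;
}
// ---------------------------------------------------------------------------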
01648 
01649 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
01650 /// the desired ByVal argument alignment.
01651 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01652   if (MaxAlign == 16)
01653     return;
01654   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01655     if (VTy->getBitWidth() == 128)
01656       MaxAlign = 16;
01657   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01658     unsigned EltAlign = 0;
01659     getMaxByValAlign(ATy->getElementType(), EltAlign);
01660     if (EltAlign > MaxAlign)
01661       MaxAlign = EltAlign;
01662   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01663     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01664       unsigned EltAlign = 0;
01665       getMaxByValAlign(STy->getElementType(i), EltAlign);
01666       if (EltAlign > MaxAlign)
01667         MaxAlign = EltAlign;
01668       if (MaxAlign == 16)
01669         break;
01670     }
01671   }
01672 }
01673 
01674 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
01675 /// function arguments in the caller parameter area. For X86, aggregates
01676 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01677 /// are at 4-byte boundaries.
01678 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01679   if (Subtarget->is64Bit()) {
01680     // Max of 8 and alignment of type.
01681     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01682     if (TyAlign > 8)
01683       return TyAlign;
01684     return 8;
01685   }
01686 
01687   unsigned Align = 4;
01688   if (Subtarget->hasSSE1())
01689     getMaxByValAlign(Ty, Align);
01690   return Align;
01691 }
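
// --- Illustrative sketch (editorial addition, not part of X86ISelLowering.cpp).
// On a 32-bit SSE subtarget, a byval aggregate containing a 128-bit vector
// should be reported as 16-byte aligned by the hook above, whereas a plain
// integer struct stays at 4 bytes (and 64-bit targets report at least 8).
// `TLI` is assumed to be an X86TargetLowering for such a subtarget;
// `sketchByValAlign` is a hypothetical helper.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Target/TargetLowering.h"

static void sketchByValAlign(const llvm::TargetLowering &TLI,
                             llvm::LLVMContext &Ctx) {
  using namespace llvm;
  Type *V4F32 = VectorType::get(Type::getFloatTy(Ctx), 4);
  Type *Fields[] = { Type::getInt32Ty(Ctx), V4F32 };
  StructType *WithVec = StructType::get(Ctx, Fields);
  unsigned Align = TLI.getByValTypeAlignment(WithVec); // expect 16 on x86-32+SSE
  (void)Align;
}
// ---------------------------------------------------------------------------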
01692 
01693 /// getOptimalMemOpType - Returns the target-specific optimal type for load
01694 /// and store operations as a result of memset, memcpy, and memmove
01695 /// lowering. If DstAlign is zero, the destination can satisfy any
01696 /// alignment constraint. Similarly, if SrcAlign is zero, there is no need
01697 /// to check it against an alignment requirement, probably because the
01698 /// source does not need to be loaded. If 'IsMemset' is true, the call is
01699 /// expanding a memset. If 'ZeroMemset' is true, it is a memset of zero.
01700 /// 'MemcpyStrSrc' indicates whether the memcpy source is constant, so it
01701 /// does not need to be loaded.
01702 /// It returns EVT::Other if the type should be determined using generic
01703 /// target-independent logic.
01704 EVT
01705 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01706                                        unsigned DstAlign, unsigned SrcAlign,
01707                                        bool IsMemset, bool ZeroMemset,
01708                                        bool MemcpyStrSrc,
01709                                        MachineFunction &MF) const {
01710   const Function *F = MF.getFunction();
01711   if ((!IsMemset || ZeroMemset) &&
01712       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
01713                                        Attribute::NoImplicitFloat)) {
01714     if (Size >= 16 &&
01715         (Subtarget->isUnalignedMemAccessFast() ||
01716          ((DstAlign == 0 || DstAlign >= 16) &&
01717           (SrcAlign == 0 || SrcAlign >= 16)))) {
01718       if (Size >= 32) {
01719         if (Subtarget->hasInt256())
01720           return MVT::v8i32;
01721         if (Subtarget->hasFp256())
01722           return MVT::v8f32;
01723       }
01724       if (Subtarget->hasSSE2())
01725         return MVT::v4i32;
01726       if (Subtarget->hasSSE1())
01727         return MVT::v4f32;
01728     } else if (!MemcpyStrSrc && Size >= 8 &&
01729                !Subtarget->is64Bit() &&
01730                Subtarget->hasSSE2()) {
01731       // Do not use f64 to lower memcpy if the source is a string constant.
01732       // It's better to use i32 to avoid the loads.
01733       return MVT::f64;
01734     }
01735   }
01736   if (Subtarget->is64Bit() && Size >= 8)
01737     return MVT::i64;
01738   return MVT::i32;
01739 }
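
// --- Illustrative sketch (editorial addition, not part of X86ISelLowering.cpp).
// A simplified, standalone restatement of the size/alignment policy above so
// the decision order is easier to follow.  The subtarget features are plain
// bools here, the NoImplicitFloat and constant-string-memcpy special cases
// are omitted, and the strings stand in for the MVTs the real hook returns.
#include <cstdint>

static const char *sketchMemOpType(uint64_t Size, unsigned DstAlign,
                                   unsigned SrcAlign, bool FastUnaligned,
                                   bool HasInt256, bool HasFp256,
                                   bool HasSSE2, bool HasSSE1, bool Is64Bit) {
  // 16 bytes or more: prefer vector stores when unaligned accesses are fast
  // or both sides are sufficiently aligned.
  bool AlignedEnough = (DstAlign == 0 || DstAlign >= 16) &&
                       (SrcAlign == 0 || SrcAlign >= 16);
  if (Size >= 16 && (FastUnaligned || AlignedEnough)) {
    if (Size >= 32) {
      if (HasInt256) return "v8i32";
      if (HasFp256)  return "v8f32";
    }
    if (HasSSE2) return "v4i32";
    if (HasSSE1) return "v4f32";
  }
  // Otherwise fall back to the widest available GPR type.
  if (Is64Bit && Size >= 8) return "i64";
  return "i32";
}
// ---------------------------------------------------------------------------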
01740 
01741 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01742   if (VT == MVT::f32)
01743     return X86ScalarSSEf32;
01744   else if (VT == MVT::f64)
01745     return X86ScalarSSEf64;
01746   return true;
01747 }
01748 
01749 bool
01750 X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
01751                                                  unsigned,
01752                                                  bool *Fast) const {
01753   if (Fast)
01754     *Fast = Subtarget->isUnalignedMemAccessFast();
01755   return true;
01756 }
01757 
01758 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
01759 /// current function.  The returned value is a member of the
01760 /// MachineJumpTableInfo::JTEntryKind enum.
01761 unsigned X86TargetLowering::getJumpTableEncoding() const {
01762   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01763   // symbol.
01764   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01765       Subtarget->isPICStyleGOT())
01766     return MachineJumpTableInfo::EK_Custom32;
01767 
01768   // Otherwise, use the normal jump table encoding heuristics.
01769   return TargetLowering::getJumpTableEncoding();
01770 }
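
// --- Illustrative sketch (editorial addition, not part of X86ISelLowering.cpp).
// A minimal check of the choice above: under 32-bit GOT PIC the hook reports
// the custom entry kind (each entry ends up as a @GOTOFF reference, see
// LowerCustomJumpTableEntry below); otherwise it defers to the generic
// heuristics.  `TLI` is assumed to be an X86TargetLowering; the helper name
// is hypothetical.
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/Target/TargetLowering.h"

static bool sketchUsesGOTOFFEntries(const llvm::TargetLowering &TLI) {
  return TLI.getJumpTableEncoding() == llvm::MachineJumpTableInfo::EK_Custom32;
}
// ---------------------------------------------------------------------------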
01771 
01772 const MCExpr *
01773 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01774                                              const MachineBasicBlock *MBB,
01775                                              unsigned uid,MCContext &Ctx) const{
01776   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
01777          Subtarget->isPICStyleGOT());
01778   // On 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01779   // relocations.
01780   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01781                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01782 }
01783 
01784 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
01785 /// jumptable.
01786 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01787                                                     SelectionDAG &DAG) const {
01788   if (!Subtarget->is64Bit())
01789     // This doesn't have SDLoc associated with it, but is not really the
01790     // same as a Register.
01791     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01792   return Table;
01793 }
01794 
01795 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
01796 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
01797 /// MCExpr.
01798 const MCExpr *X86TargetLowering::
01799 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01800                              MCContext &Ctx) const {
01801   // X86-64 uses RIP relative addressing based on the jump table label.
01802   if (Subtarget->isPICStyleRIPRel())
01803     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01804 
01805   // Otherwise, the reference is relative to the PIC base.
01806   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01807 }
01808 
01809 // FIXME: Why is this routine here? Move to RegInfo!
01810 std::pair<const TargetRegisterClass*, uint8_t>
01811 X86TargetLowering::findRepresentativeClass(MVT VT) const{
01812   const TargetRegisterClass *RRC = nullptr;
01813   uint8_t Cost = 1;
01814   switch (VT.SimpleTy) {
01815   default:
01816     return TargetLowering::findRepresentativeClass(VT);
01817   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01818     RRC = Subtarget->is64Bit() ?
01819       (const TargetRegisterClass*)&X86::GR64RegClass :
01820       (const TargetRegisterClass*)&X86::GR32RegClass;
01821     break;
01822   case MVT::x86mmx:
01823     RRC = &X86::VR64RegClass;
01824     break;
01825   case MVT::f32: case MVT::f64:
01826   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01827   case MVT::v4f32: case MVT::v2f64:
01828   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01829   case MVT::v4f64:
01830     RRC = &X86::VR128RegClass;
01831     break;
01832   }
01833   return std::make_pair(RRC, Cost);
01834 }
01835 
01836 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01837                                                unsigned &Offset) const {
01838   if (!Subtarget->isTargetLinux())
01839     return false;
01840 
01841   if (Subtarget->is64Bit()) {
01842     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
01843     Offset = 0x28;
01844     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01845       AddressSpace = 256;
01846     else
01847       AddressSpace = 257;
01848   } else {
01849     // %gs:0x14 on i386
01850     Offset = 0x14;
01851     AddressSpace = 256;
01852   }
01853   return true;
01854 }
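
// --- Illustrative sketch (editorial addition, not part of X86ISelLowering.cpp).
// The stack-protector pass uses this hook to locate the guard value.  On
// Linux x86-64 (non-kernel code model) that is offset 0x28 in address space
// 257, i.e. %fs:0x28, and on i386 it is %gs:0x14 in address space 256.
// `TLI` is assumed to be an X86TargetLowering for a Linux target; the helper
// name is hypothetical.
#include "llvm/Target/TargetLowering.h"

static void sketchStackCookie(const llvm::TargetLowering &TLI) {
  unsigned AddrSpace = 0, Offset = 0;
  if (TLI.getStackCookieLocation(AddrSpace, Offset)) {
    // 64-bit, non-kernel code model: AddrSpace == 257, Offset == 0x28.
    // 32-bit:                        AddrSpace == 256, Offset == 0x14.
  }
  (void)AddrSpace;
  (void)Offset;
}
// ---------------------------------------------------------------------------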
01855 
01856 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01857                                             unsigned DestAS) const {
01858   assert(SrcAS != DestAS && "Expected different address spaces!");
01859 
01860   return SrcAS < 256 && DestAS < 256;
01861 }
01862 
01863 //===----------------------------------------------------------------------===//
01864 //               Return Value Calling Convention Implementation
01865 //===----------------------------------------------------------------------===//
01866 
01867 #include "X86GenCallingConv.inc"
01868 
01869 bool
01870 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01871                                   MachineFunction &MF, bool isVarArg,
01872                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01873                         LLVMContext &Context) const {
01874   SmallVector<CCValAssign, 16> RVLocs;
01875   CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
01876                  RVLocs, Context);
01877   return CCInfo.CheckReturn(Outs, RetCC_X86);
01878 }
01879 
01880 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
01881   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
01882   return ScratchRegs;
01883 }
01884 
01885 SDValue
01886 X86TargetLowering::LowerReturn(SDValue Chain,
01887                                CallingConv::ID CallConv, bool isVarArg,
01888                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01889                                const SmallVectorImpl<SDValue> &OutVals,
01890                                SDLoc dl, SelectionDAG &DAG) const {
01891   MachineFunction &MF = DAG.getMachineFunction();
01892   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01893 
01894   SmallVector<CCValAssign, 16> RVLocs;
01895   CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
01896                  RVLocs, *DAG.getContext());
01897   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01898 
01899   SDValue Flag;
01900   SmallVector<SDValue, 6> RetOps;
01901   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
01902   // Operand #1 = Bytes To Pop
01903   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
01904                    MVT::i16));
01905 
01906   // Copy the result values into the output registers.
01907   for (unsigned i = 0; i != RVLocs.size(); ++i) {
01908     CCValAssign &VA = RVLocs[i];
01909     assert(VA.isRegLoc() && "Can only return in registers!");
01910     SDValue ValToCopy = OutVals[i];
01911     EVT ValVT = ValToCopy.getValueType();
01912 
01913     // Promote values to the appropriate types
01914     if (VA.getLocInfo() == CCValAssign::SExt)
01915       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
01916     else if (VA.getLocInfo() == CCValAssign::ZExt)
01917       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
01918     else if (VA.getLocInfo() == CCValAssign::AExt)
01919       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
01920     else if (VA.getLocInfo() == CCValAssign::BCvt)
01921       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
01922 
01923     assert(VA.getLocInfo() != CCValAssign::FPExt &&
01924            "Unexpected FP-extend for return value.");  
01925 
01926     // If this is x86-64, and we disabled SSE, we can't return FP values,
01927     // or SSE or MMX vectors.
01928     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
01929          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
01930           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
01931       report_fatal_error("SSE register return with SSE disabled");
01932     }
01933     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
01934     // llvm-gcc has never done it right and no one has noticed, so this
01935     // should be OK for now.
01936     if (ValVT == MVT::f64 &&
01937         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
01938       report_fatal_error("SSE2 register return with SSE2 disabled");
01939 
01940     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
01941     // the RET instruction and handled by the FP Stackifier.
01942     if (VA.getLocReg() == X86::ST0 ||
01943         VA.getLocReg() == X86::ST1) {
01944       // If this is a copy from an xmm register to ST(0), use an FPExtend to
01945       // change the value to the FP stack register class.
01946       if (isScalarFPTypeInSSEReg(VA.getValVT()))
01947         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
01948       RetOps.push_back(ValToCopy);
01949       // Don't emit a copytoreg.
01950       continue;
01951     }
01952 
01953     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
01954     // which is returned in RAX / RDX.
01955     if (Subtarget->is64Bit()) {
01956       if (ValVT == MVT::x86mmx) {
01957         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
01958           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
01959           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
01960                                   ValToCopy);
01961           // If we don't have SSE2 available, convert to v4f32 so the generated
01962           // register is legal.
01963           if (!Subtarget->hasSSE2())
01964             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
01965         }
01966       }
01967     }
01968 
01969     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
01970     Flag = Chain.getValue(1);
01971     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
01972   }
01973 
01974   // The x86-64 ABIs require that for returning structs by value we copy
01975   // the sret argument into %rax/%eax (depending on ABI) for the return.
01976   // Win32 requires us to put the sret argument in %eax as well.
01977   // We saved the argument into a virtual register in the entry block,
01978   // so now we copy the value out and into %rax/%eax.
01979   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
01980       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
01981     MachineFunction &MF = DAG.getMachineFunction();
01982     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01983     unsigned Reg = FuncInfo->getSRetReturnReg();
01984     assert(Reg &&
01985            "SRetReturnReg should have been set in LowerFormalArguments().");
01986     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
01987 
01988     unsigned RetValReg
01989         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
01990           X86::RAX : X86::EAX;
01991     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
01992     Flag = Chain.getValue(1);
01993 
01994     // RAX/EAX now acts like a return value.
01995     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
01996   }
01997 
01998   RetOps[0] = Chain;  // Update chain.
01999 
02000   // Add the flag if we have it.
02001   if (Flag.getNode())
02002     RetOps.push_back(Flag);
02003 
02004   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
02005 }
02006 
02007 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
02008   if (N->getNumValues() != 1)
02009     return false;
02010   if (!N->hasNUsesOfValue(1, 0))
02011     return false;
02012 
02013   SDValue TCChain = Chain;
02014   SDNode *Copy = *N->use_begin();
02015   if (Copy->getOpcode() == ISD::CopyToReg) {
02016     // If the copy has a glue operand, we conservatively assume it isn't safe to
02017     // perform a tail call.
02018     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
02019       return false;
02020     TCChain = Copy->getOperand(0);
02021   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
02022     return false;
02023 
02024   bool HasRet = false;
02025   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
02026        UI != UE; ++UI) {
02027     if (UI->getOpcode() != X86ISD::RET_FLAG)
02028       return false;
02029     HasRet = true;
02030   }
02031 
02032   if (!HasRet)
02033     return false;
02034 
02035   Chain = TCChain;
02036   return true;
02037 }
02038 
02039 MVT
02040 X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
02041                                             ISD::NodeType ExtendKind) const {
02042   MVT ReturnMVT;
02043   // TODO: Is this also valid on 32-bit?
02044   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
02045     ReturnMVT = MVT::i8;
02046   else
02047     ReturnMVT = MVT::i32;
02048 
02049   MVT MinVT = getRegisterType(ReturnMVT);
02050   return VT.bitsLT(MinVT) ? MinVT : VT;
02051 }
02052 
02053 /// LowerCallResult - Lower the result values of a call into the
02054 /// appropriate copies out of appropriate physical registers.
02055 ///
02056 SDValue
02057 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
02058                                    CallingConv::ID CallConv, bool isVarArg,
02059                                    const SmallVectorImpl<ISD::InputArg> &Ins,
02060                                    SDLoc dl, SelectionDAG &DAG,
02061                                    SmallVectorImpl<SDValue> &InVals) const {
02062 
02063   // Assign locations to each value returned by this call.
02064   SmallVector<CCValAssign, 16> RVLocs;
02065   bool Is64Bit = Subtarget->is64Bit();
02066   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
02067                  DAG.getTarget(), RVLocs, *DAG.getContext());
02068   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02069 
02070   // Copy all of the result registers out of their specified physreg.
02071   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02072     CCValAssign &VA = RVLocs[i];
02073     EVT CopyVT = VA.getValVT();
02074 
02075     // If this is x86-64, and we disabled SSE, we can't return FP values
02076     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02077         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02078       report_fatal_error("SSE register return with SSE disabled");
02079     }
02080 
02081     SDValue Val;
02082 
02083     // If this is a call to a function that returns an fp value on the floating
02084     // point stack, we must guarantee the value is popped from the stack, so
02085     // a CopyFromReg is not good enough - the copy instruction may be eliminated
02086     // if the return value is not used. We use the FpPOP_RETVAL instruction
02087     // instead.
02088     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
02089       // If we prefer to use the value in xmm registers, copy it out as f80 and
02090       // use a truncate to move it from fp stack reg to xmm reg.
02091       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
02092       SDValue Ops[] = { Chain, InFlag };
02093       Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
02094                                          MVT::Other, MVT::Glue, Ops), 1);
02095       Val = Chain.getValue(0);
02096 
02097       // Round the f80 to the right size, which also moves it to the appropriate
02098       // xmm register.
02099       if (CopyVT != VA.getValVT())
02100         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02101                           // This truncation won't change the value.
02102                           DAG.getIntPtrConstant(1));
02103     } else {
02104       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02105                                  CopyVT, InFlag).getValue(1);
02106       Val = Chain.getValue(0);
02107     }
02108     InFlag = Chain.getValue(2);
02109     InVals.push_back(Val);
02110   }
02111 
02112   return Chain;
02113 }
02114 
02115 //===----------------------------------------------------------------------===//
02116 //                C & StdCall & Fast Calling Convention implementation
02117 //===----------------------------------------------------------------------===//
02118 //  The StdCall calling convention is the standard for many Windows API
02119 //  routines. It differs from the C calling convention only slightly: the
02120 //  callee cleans up the stack rather than the caller, and symbols are
02121 //  decorated in a particular way. It doesn't support any vector arguments.
02122 //  For info on the fast calling convention, see the Fast Calling Convention
02123 //  (tail call) implementation in LowerX86_32FastCCCallTo.
02124 
02125 /// callIsStructReturn - Determines whether a call uses struct return
02126 /// semantics.
02127 enum StructReturnType {
02128   NotStructReturn,
02129   RegStructReturn,
02130   StackStructReturn
02131 };
02132 static StructReturnType
02133 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02134   if (Outs.empty())
02135     return NotStructReturn;
02136 
02137   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02138   if (!Flags.isSRet())
02139     return NotStructReturn;
02140   if (Flags.isInReg())
02141     return RegStructReturn;
02142   return StackStructReturn;
02143 }
02144 
02145 /// argsAreStructReturn - Determines whether a function uses struct
02146 /// return semantics.
02147 static StructReturnType
02148 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02149   if (Ins.empty())
02150     return NotStructReturn;
02151 
02152   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02153   if (!Flags.isSRet())
02154     return NotStructReturn;
02155   if (Flags.isInReg())
02156     return RegStructReturn;
02157   return StackStructReturn;
02158 }
02159 
02160 /// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
02161 /// specified by "Src" to the address "Dst", with the size and alignment
02162 /// information specified by the parameter attribute. The copy will be passed
02163 /// as a byval function parameter.
02164 static SDValue
02165 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02166                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02167                           SDLoc dl) {
02168   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02169 
02170   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02171                        /*isVolatile*/false, /*AlwaysInline=*/true,
02172                        MachinePointerInfo(), MachinePointerInfo());
02173 }
02174 
02175 /// IsTailCallConvention - Return true if the calling convention is one that
02176 /// supports tail call optimization.
02177 static bool IsTailCallConvention(CallingConv::ID CC) {
02178   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02179           CC == CallingConv::HiPE);
02180 }
02181 
02182 /// \brief Return true if the calling convention is a C calling convention.
02183 static bool IsCCallConvention(CallingConv::ID CC) {
02184   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02185           CC == CallingConv::X86_64_SysV);
02186 }
02187 
02188 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02189   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02190     return false;
02191 
02192   CallSite CS(CI);
02193   CallingConv::ID CalleeCC = CS.getCallingConv();
02194   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02195     return false;
02196 
02197   return true;
02198 }
02199 
02200 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
02201 /// a tailcall target by changing its ABI.
02202 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02203                                    bool GuaranteedTailCallOpt) {
02204   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02205 }
02206 
02207 SDValue
02208 X86TargetLowering::LowerMemArgument(SDValue Chain,
02209                                     CallingConv::ID CallConv,
02210                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02211                                     SDLoc dl, SelectionDAG &DAG,
02212                                     const CCValAssign &VA,
02213                                     MachineFrameInfo *MFI,
02214                                     unsigned i) const {
02215   // Create the nodes corresponding to a load from this parameter slot.
02216   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02217   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
02218       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
02219   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02220   EVT ValVT;
02221 
02222   // If the value is passed by pointer, the address is passed instead of the
02223   // value itself.
02224   if (VA.getLocInfo() == CCValAssign::Indirect)
02225     ValVT = VA.getLocVT();
02226   else
02227     ValVT = VA.getValVT();
02228 
02229   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02230   // changed with more analysis.
02231   // In case of tail call optimization, mark all arguments mutable, since they
02232   // could be overwritten by the lowering of arguments during a tail call.
02233   if (Flags.isByVal()) {
02234     unsigned Bytes = Flags.getByValSize();
02235     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02236     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02237     return DAG.getFrameIndex(FI, getPointerTy());
02238   } else {
02239     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02240                                     VA.getLocMemOffset(), isImmutable);
02241     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02242     return DAG.getLoad(ValVT, dl, Chain, FIN,
02243                        MachinePointerInfo::getFixedStack(FI),
02244                        false, false, false, 0);
02245   }
02246 }
02247 
02248 SDValue
02249 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02250                                         CallingConv::ID CallConv,
02251                                         bool isVarArg,
02252                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02253                                         SDLoc dl,
02254                                         SelectionDAG &DAG,
02255                                         SmallVectorImpl<SDValue> &InVals)
02256                                           const {
02257   MachineFunction &MF = DAG.getMachineFunction();
02258   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02259 
02260   const Function* Fn = MF.getFunction();
02261   if (Fn->hasExternalLinkage() &&
02262       Subtarget->isTargetCygMing() &&
02263       Fn->getName() == "main")
02264     FuncInfo->setForceFramePointer(true);
02265 
02266   MachineFrameInfo *MFI = MF.getFrameInfo();
02267   bool Is64Bit = Subtarget->is64Bit();
02268   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02269 
02270   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02271          "Var args not supported with calling convention fastcc, ghc or hipe");
02272 
02273   // Assign locations to all of the incoming arguments.
02274   SmallVector<CCValAssign, 16> ArgLocs;
02275   CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
02276                  ArgLocs, *DAG.getContext());
02277 
02278   // Allocate shadow area for Win64
02279   if (IsWin64)
02280     CCInfo.AllocateStack(32, 8);
02281 
02282   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02283 
02284   unsigned LastVal = ~0U;
02285   SDValue ArgValue;
02286   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02287     CCValAssign &VA = ArgLocs[i];
02288     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02289     // places.
02290     assert(VA.getValNo() != LastVal &&
02291            "Don't support value assigned to multiple locs yet");
02292     (void)LastVal;
02293     LastVal = VA.getValNo();
02294 
02295     if (VA.isRegLoc()) {
02296       EVT RegVT = VA.getLocVT();
02297       const TargetRegisterClass *RC;
02298       if (RegVT == MVT::i32)
02299         RC = &X86::GR32RegClass;
02300       else if (Is64Bit && RegVT == MVT::i64)
02301         RC = &X86::GR64RegClass;
02302       else if (RegVT == MVT::f32)
02303         RC = &X86::FR32RegClass;
02304       else if (RegVT == MVT::f64)
02305         RC = &X86::FR64RegClass;
02306       else if (RegVT.is512BitVector())
02307         RC = &X86::VR512RegClass;
02308       else if (RegVT.is256BitVector())
02309         RC = &X86::VR256RegClass;
02310       else if (RegVT.is128BitVector())
02311         RC = &X86::VR128RegClass;
02312       else if (RegVT == MVT::x86mmx)
02313         RC = &X86::VR64RegClass;
02314       else if (RegVT == MVT::i1)
02315         RC = &X86::VK1RegClass;
02316       else if (RegVT == MVT::v8i1)
02317         RC = &X86::VK8RegClass;
02318       else if (RegVT == MVT::v16i1)
02319         RC = &X86::VK16RegClass;
02320       else if (RegVT == MVT::v32i1)
02321         RC = &X86::VK32RegClass;
02322       else if (RegVT == MVT::v64i1)
02323         RC = &X86::VK64RegClass;
02324       else
02325         llvm_unreachable("Unknown argument type!");
02326 
02327       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02328       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02329 
02330       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02331       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02332       // right size.
02333       if (VA.getLocInfo() == CCValAssign::SExt)
02334         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02335                                DAG.getValueType(VA.getValVT()));
02336       else if (VA.getLocInfo() == CCValAssign::ZExt)
02337         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02338                                DAG.getValueType(VA.getValVT()));
02339       else if (VA.getLocInfo() == CCValAssign::BCvt)
02340         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
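      // Illustrative example (assumed IR, not taken from this file): for a
      // callee such as
      //
      //   define i32 @use(i8 signext %c)
      //
      // the i8 argument arrives sign-extended in a 32-bit register, so the DAG
      // built here looks roughly like
      //
      //   t1: i32 = CopyFromReg %vreg
      //   t2: i32 = AssertSext t1, ValueType:i8
      //   t3: i8  = truncate t2        // emitted below when isExtInLoc()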
02341 
02342       if (VA.isExtInLoc()) {
02343         // Handle MMX values passed in XMM regs.
02344         if (RegVT.isVector())
02345           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02346         else
02347           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02348       }
02349     } else {
02350       assert(VA.isMemLoc());
02351       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02352     }
02353 
02354     // If the value is passed via a pointer, do a load.
02355     if (VA.getLocInfo() == CCValAssign::Indirect)
02356       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02357                              MachinePointerInfo(), false, false, false, 0);
02358 
02359     InVals.push_back(ArgValue);
02360   }
02361 
02362   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
02363     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02364       // The x86-64 ABIs require that for returning structs by value we copy
02365       // the sret argument into %rax/%eax (depending on ABI) for the return.
02366       // Win32 requires us to put the sret argument into %eax as well.
02367       // Save the argument into a virtual register so that we can access it
02368       // from the return points.
02369       if (Ins[i].Flags.isSRet()) {
02370         unsigned Reg = FuncInfo->getSRetReturnReg();
02371         if (!Reg) {
02372           MVT PtrTy = getPointerTy();
02373           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02374           FuncInfo->setSRetReturnReg(Reg);
02375         }
02376         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
02377         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02378         break;
02379       }
02380     }
02381   }
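  // Illustrative example (assumed IR): for a callee such as
  //
  //   define void @make(%struct.Big* sret %out)
  //
  // the incoming sret pointer is copied into a dedicated virtual register here
  // so that the return lowering can later move it into %eax/%rax as required.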
02382 
02383   unsigned StackSize = CCInfo.getNextStackOffset();
02384   // Align stack specially for tail calls.
02385   if (FuncIsMadeTailCallSafe(CallConv,
02386                              MF.getTarget().Options.GuaranteedTailCallOpt))
02387     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02388 
02389   // If the function takes a variable number of arguments, make a frame index for
02390   // the start of the first vararg value... for expansion of llvm.va_start.
02391   if (isVarArg) {
02392     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02393                     CallConv != CallingConv::X86_ThisCall)) {
02394       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
02395     }
02396     if (Is64Bit) {
02397       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
02398 
02399       // FIXME: We should really autogenerate these arrays
02400       static const MCPhysReg GPR64ArgRegsWin64[] = {
02401         X86::RCX, X86::RDX, X86::R8,  X86::R9
02402       };
02403       static const MCPhysReg GPR64ArgRegs64Bit[] = {
02404         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02405       };
02406       static const MCPhysReg XMMArgRegs64Bit[] = {
02407         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02408         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02409       };
02410       const MCPhysReg *GPR64ArgRegs;
02411       unsigned NumXMMRegs = 0;
02412 
02413       if (IsWin64) {
02414         // The XMM registers which might contain var arg parameters are shadowed
02415         // in their paired GPRs.  So we only need to save the GPRs to their home
02416         // slots.
02417         TotalNumIntRegs = 4;
02418         GPR64ArgRegs = GPR64ArgRegsWin64;
02419       } else {
02420         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
02421         GPR64ArgRegs = GPR64ArgRegs64Bit;
02422 
02423         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
02424                                                 TotalNumXMMRegs);
02425       }
02426       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
02427                                                        TotalNumIntRegs);
02428 
02429       bool NoImplicitFloatOps = Fn->getAttributes().
02430         hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
02431       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02432              "SSE register cannot be used when SSE is disabled!");
02433       assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
02434                NoImplicitFloatOps) &&
02435              "SSE register cannot be used when SSE is disabled!");
02436       if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02437           !Subtarget->hasSSE1())
02438         // Kernel mode asks for SSE to be disabled, so don't push the XMM
02439         // registers on the stack.
02440         TotalNumXMMRegs = 0;
02441 
02442       if (IsWin64) {
02443         const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
02444         // Get to the caller-allocated home save location.  Add 8 to account
02445         // for the return address.
02446         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02447         FuncInfo->setRegSaveFrameIndex(
02448           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02449         // Fix up the vararg frame index to point at the shadow area (4 x i64).
02450         if (NumIntRegs < 4)
02451           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02452       } else {
02453         // For X86-64, if there are vararg parameters that are passed via
02454         // registers, then we must store them to their spots on the stack so
02455         // they may be loaded by dereferencing the result of va_next.
02456         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02457         FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
02458         FuncInfo->setRegSaveFrameIndex(
02459           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
02460                                false));
02461       }
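      // Worked example (illustrative numbers): if the fixed parameters of a
      // varargs function consume %rdi and %rsi plus one XMM register, then
      // NumIntRegs == 2 and NumXMMRegs == 1, so the va_list state computed
      // here is roughly
      //
      //   gp_offset = 2 * 8          = 16   // next free GPR slot
      //   fp_offset = 6 * 8 + 1 * 16 = 64   // next free XMM slot
      //
      // and the register save area itself is a 6*8 + 8*16 = 176-byte object.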
02462 
02463       // Store the integer parameter registers.
02464       SmallVector<SDValue, 8> MemOps;
02465       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02466                                         getPointerTy());
02467       unsigned Offset = FuncInfo->getVarArgsGPOffset();
02468       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
02469         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02470                                   DAG.getIntPtrConstant(Offset));
02471         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
02472                                      &X86::GR64RegClass);
02473         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
02474         SDValue Store =
02475           DAG.getStore(Val.getValue(1), dl, Val, FIN,
02476                        MachinePointerInfo::getFixedStack(
02477                          FuncInfo->getRegSaveFrameIndex(), Offset),
02478                        false, false, 0);
02479         MemOps.push_back(Store);
02480         Offset += 8;
02481       }
02482 
02483       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
02484         // Now store the XMM (fp + vector) parameter registers.
02485         SmallVector<SDValue, 11> SaveXMMOps;
02486         SaveXMMOps.push_back(Chain);
02487 
02488         unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02489         SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
02490         SaveXMMOps.push_back(ALVal);
02491 
02492         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02493                                FuncInfo->getRegSaveFrameIndex()));
02494         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02495                                FuncInfo->getVarArgsFPOffset()));
02496 
02497         for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
02498           unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
02499                                        &X86::VR128RegClass);
02500           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
02501           SaveXMMOps.push_back(Val);
02502         }
02503         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02504                                      MVT::Other, SaveXMMOps));
02505       }
02506 
02507       if (!MemOps.empty())
02508         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
02509     }
02510   }
02511 
02512   // Some CCs need callee pop.
02513   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02514                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02515     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02516   } else {
02517     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02518     // If this is an sret function, the return should pop the hidden pointer.
02519     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02520         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02521         argsAreStructReturn(Ins) == StackStructReturn)
02522       FuncInfo->setBytesToPopOnReturn(4);
02523   }
02524 
02525   if (!Is64Bit) {
02526     // RegSaveFrameIndex is X86-64 only.
02527     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02528     if (CallConv == CallingConv::X86_FastCall ||
02529         CallConv == CallingConv::X86_ThisCall)
02530       // fastcall and thiscall functions can't have varargs.
02531       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02532   }
02533 
02534   FuncInfo->setArgumentStackSize(StackSize);
02535 
02536   return Chain;
02537 }
02538 
02539 SDValue
02540 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02541                                     SDValue StackPtr, SDValue Arg,
02542                                     SDLoc dl, SelectionDAG &DAG,
02543                                     const CCValAssign &VA,
02544                                     ISD::ArgFlagsTy Flags) const {
02545   unsigned LocMemOffset = VA.getLocMemOffset();
02546   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02547   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02548   if (Flags.isByVal())
02549     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02550 
02551   return DAG.getStore(Chain, dl, Arg, PtrOff,
02552                       MachinePointerInfo::getStack(LocMemOffset),
02553                       false, false, 0);
02554 }
02555 
02556 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
02557 /// optimization is performed and it is required.
02558 SDValue
02559 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02560                                            SDValue &OutRetAddr, SDValue Chain,
02561                                            bool IsTailCall, bool Is64Bit,
02562                                            int FPDiff, SDLoc dl) const {
02563   // Adjust the Return address stack slot.
02564   EVT VT = getPointerTy();
02565   OutRetAddr = getReturnAddressFrameIndex(DAG);
02566 
02567   // Load the "old" Return address.
02568   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02569                            false, false, false, 0);
02570   return SDValue(OutRetAddr.getNode(), 1);
02571 }
02572 
02573 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
02574 /// optimization is performed and it is required (FPDiff!=0).
02575 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
02576                                         SDValue Chain, SDValue RetAddrFrIdx,
02577                                         EVT PtrVT, unsigned SlotSize,
02578                                         int FPDiff, SDLoc dl) {
02579   // Store the return address to the appropriate stack slot.
02580   if (!FPDiff) return Chain;
02581   // Calculate the new stack slot for the return address.
02582   int NewReturnAddrFI =
02583     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02584                                          false);
02585   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02586   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02587                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02588                        false, false, 0);
02589   return Chain;
02590 }
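// Worked example (illustrative numbers, 8-byte slot): suppose the caller pops
// 16 bytes of its own arguments on return but the tail-called callee needs 32
// bytes of argument space.  Then FPDiff = 16 - 32 = -16, so the return address
// is reloaded from its old slot and stored 16 bytes lower, into a fixed object
// created at offset FPDiff - SlotSize, before the tail jump is emitted.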
02591 
02592 SDValue
02593 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02594                              SmallVectorImpl<SDValue> &InVals) const {
02595   SelectionDAG &DAG                     = CLI.DAG;
02596   SDLoc &dl                             = CLI.DL;
02597   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02598   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02599   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02600   SDValue Chain                         = CLI.Chain;
02601   SDValue Callee                        = CLI.Callee;
02602   CallingConv::ID CallConv              = CLI.CallConv;
02603   bool &isTailCall                      = CLI.IsTailCall;
02604   bool isVarArg                         = CLI.IsVarArg;
02605 
02606   MachineFunction &MF = DAG.getMachineFunction();
02607   bool Is64Bit        = Subtarget->is64Bit();
02608   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02609   StructReturnType SR = callIsStructReturn(Outs);
02610   bool IsSibcall      = false;
02611 
02612   if (MF.getTarget().Options.DisableTailCalls)
02613     isTailCall = false;
02614 
02615   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
02616   if (IsMustTail) {
02617     // Force this to be a tail call.  The verifier rules are enough to ensure
02618     // that we can lower this successfully without moving the return address
02619     // around.
02620     isTailCall = true;
02621   } else if (isTailCall) {
02622     // Check if it's really possible to do a tail call.
02623     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02624                     isVarArg, SR != NotStructReturn,
02625                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02626                     Outs, OutVals, Ins, DAG);
02627 
02628     // Sibcalls are automatically detected tailcalls which do not require
02629     // ABI changes.
02630     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02631       IsSibcall = true;
02632 
02633     if (isTailCall)
02634       ++NumTailCalls;
02635   }
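  // Illustrative example (assumed IR): a call marked 'musttail', e.g.
  //
  //   define i32 @f(i32 %x) {
  //     %r = musttail call i32 @g(i32 %x)
  //     ret i32 %r
  //   }
  //
  // is forced down the tail-call path unconditionally, whereas a plain
  // 'tail call' is first checked by IsEligibleForTailCallOptimization and,
  // without -tailcallopt, is emitted as a sibcall when it passes that check.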
02636 
02637   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02638          "Var args not supported with calling convention fastcc, ghc or hipe");
02639 
02640   // Analyze operands of the call, assigning locations to each operand.
02641   SmallVector<CCValAssign, 16> ArgLocs;
02642   CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
02643                  ArgLocs, *DAG.getContext());
02644 
02645   // Allocate shadow area for Win64
02646   if (IsWin64)
02647     CCInfo.AllocateStack(32, 8);
02648 
02649   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02650 
02651   // Get a count of how many bytes are to be pushed on the stack.
02652   unsigned NumBytes = CCInfo.getNextStackOffset();
02653   if (IsSibcall)
02654     // This is a sibcall. The memory operands are already in place in the
02655     // caller's own incoming argument area (its caller's stack), so push nothing.
02656     NumBytes = 0;
02657   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
02658            IsTailCallConvention(CallConv))
02659     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02660 
02661   int FPDiff = 0;
02662   if (isTailCall && !IsSibcall && !IsMustTail) {
02663     // Lower arguments at fp - stackoffset + fpdiff.
02664     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02665     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02666 
02667     FPDiff = NumBytesCallerPushed - NumBytes;
02668 
02669     // Record the delta by which the return address stack slot must move,
02670     // but only if it is larger (more negative) than any previously recorded delta.
02671     if (FPDiff < X86Info->getTCReturnAddrDelta())
02672       X86Info->setTCReturnAddrDelta(FPDiff);
02673   }
02674 
02675   unsigned NumBytesToPush = NumBytes;
02676   unsigned NumBytesToPop = NumBytes;
02677 
02678   // If we have an inalloca argument, all stack space has already been allocated
02679   // for us and is right at the top of the stack.  We don't support multiple
02680   // arguments passed in memory when using inalloca.
02681   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02682     NumBytesToPush = 0;
02683     assert(ArgLocs.back().getLocMemOffset() == 0 &&
02684            "an inalloca argument must be the only memory argument");
02685   }
02686 
02687   if (!IsSibcall)
02688     Chain = DAG.getCALLSEQ_START(
02689         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02690 
02691   SDValue RetAddrFrIdx;
02692   // Load return address for tail calls.
02693   if (isTailCall && FPDiff)
02694     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02695                                     Is64Bit, FPDiff, dl);
02696 
02697   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02698   SmallVector<SDValue, 8> MemOpChains;
02699   SDValue StackPtr;
02700 
02701   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02702   // of tail call optimization, arguments are handled later.
02703   const X86RegisterInfo *RegInfo =
02704     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
02705   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02706     // Skip inalloca arguments, they have already been written.
02707     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02708     if (Flags.isInAlloca())
02709       continue;
02710 
02711     CCValAssign &VA = ArgLocs[i];
02712     EVT RegVT = VA.getLocVT();
02713     SDValue Arg = OutVals[i];
02714     bool isByVal = Flags.isByVal();
02715 
02716     // Promote the value if needed.
02717     switch (VA.getLocInfo()) {
02718     default: llvm_unreachable("Unknown loc info!");
02719     case CCValAssign::Full: break;
02720     case CCValAssign::SExt:
02721       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02722       break;
02723     case CCValAssign::ZExt:
02724       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02725       break;
02726     case CCValAssign::AExt:
02727       if (RegVT.is128BitVector()) {
02728         // Special case: passing MMX values in XMM registers.
02729         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02730         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02731         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02732       } else
02733         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02734       break;
02735     case CCValAssign::BCvt:
02736       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02737       break;
02738     case CCValAssign::Indirect: {
02739       // Store the argument.
02740       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02741       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02742       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02743                            MachinePointerInfo::getFixedStack(FI),
02744                            false, false, 0);
02745       Arg = SpillSlot;
02746       break;
02747     }
02748     }
02749 
02750     if (VA.isRegLoc()) {
02751       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02752       if (isVarArg && IsWin64) {
02753         // Win64 ABI requires argument XMM reg to be copied to the corresponding
02754         // shadow reg if callee is a varargs function.
02755         unsigned ShadowReg = 0;
02756         switch (VA.getLocReg()) {
02757         case X86::XMM0: ShadowReg = X86::RCX; break;
02758         case X86::XMM1: ShadowReg = X86::RDX; break;
02759         case X86::XMM2: ShadowReg = X86::R8; break;
02760         case X86::XMM3: ShadowReg = X86::R9; break;
02761         }
02762         if (ShadowReg)
02763           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02764       }
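      // Illustrative example (assumed call site): when calling a varargs
      // callee on Win64, a double passed as the second argument travels in
      // XMM1, and the same value is also copied into RDX here so that varargs
      // handling in the callee, which walks the GPR home area, can find it.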
02765     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02766       assert(VA.isMemLoc());
02767       if (!StackPtr.getNode())
02768         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02769                                       getPointerTy());
02770       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02771                                              dl, DAG, VA, Flags));
02772     }
02773   }
02774 
02775   if (!MemOpChains.empty())
02776     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
02777 
02778   if (Subtarget->isPICStyleGOT()) {
02779     // ELF / PIC requires the GOT pointer to be in the EBX register before
02780     // function calls made via the PLT.
02781     if (!isTailCall) {
02782       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02783                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02784     } else {
02785       // If we are tail calling and generating PIC/GOT style code load the
02786       // address of the callee into ECX. The value in ecx is used as target of
02787       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02788       // for tail calls on PIC/GOT architectures. Normally we would just put the
02789       // address of GOT into ebx and then call target@PLT. But for tail calls
02790       // ebx would be restored (since ebx is callee saved) before jumping to the
02791       // target@PLT.
02792 
02793       // Note: The actual moving to ECX is done further down.
02794       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02795       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02796           !G->getGlobal()->hasProtectedVisibility())
02797         Callee = LowerGlobalAddress(Callee, DAG);
02798       else if (isa<ExternalSymbolSDNode>(Callee))
02799         Callee = LowerExternalSymbol(Callee, DAG);
02800     }
02801   }
02802 
02803   if (Is64Bit && isVarArg && !IsWin64) {
02804     // From AMD64 ABI document:
02805     // For calls that may call functions that use varargs or stdargs
02806     // (prototype-less calls or calls to functions containing ellipsis (...) in
02807     // the declaration) %al is used as hidden argument to specify the number
02808     // of SSE registers used. The contents of %al do not need to match exactly
02809     // the number of registers, but must be an upper bound on the number of SSE
02810     // registers used and is in the range 0 - 8 inclusive.
02811 
02812     // Count the number of XMM registers allocated.
02813     static const MCPhysReg XMMArgRegs[] = {
02814       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02815       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02816     };
02817     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
02818     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02819            && "SSE registers cannot be used when SSE is disabled");
02820 
02821     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02822                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
02823   }
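  // Illustrative example: for a call such as printf("%f\n", 3.14) on the
  // SysV x86-64 ABI, one XMM register is allocated for the double, so the
  // code above ends up emitting the equivalent of 'mov al, 1' immediately
  // before the call instruction.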
02824 
02825   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
02826   // don't need this because the eligibility check rejects calls that require
02827   // shuffling arguments passed in memory.
02828   if (!IsSibcall && isTailCall) {
02829     // Force all the incoming stack arguments to be loaded from the stack
02830     // before any new outgoing arguments are stored to the stack, because the
02831     // outgoing stack slots may alias the incoming argument stack slots, and
02832     // the alias isn't otherwise explicit. This is slightly more conservative
02833     // than necessary, because it means that each store effectively depends
02834     // on every argument instead of just those arguments it would clobber.
02835     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02836 
02837     SmallVector<SDValue, 8> MemOpChains2;
02838     SDValue FIN;
02839     int FI = 0;
02840     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02841       CCValAssign &VA = ArgLocs[i];
02842       if (VA.isRegLoc())
02843         continue;
02844       assert(VA.isMemLoc());
02845       SDValue Arg = OutVals[i];
02846       ISD::ArgFlagsTy Flags = Outs[i].Flags;
02847       // Skip inalloca arguments.  They don't require any work.
02848       if (Flags.isInAlloca())
02849         continue;
02850       // Create frame index.
02851       int32_t Offset = VA.getLocMemOffset()+FPDiff;
02852       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
02853       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
02854       FIN = DAG.getFrameIndex(FI, getPointerTy());
02855 
02856       if (Flags.isByVal()) {
02857         // Copy relative to framepointer.
02858         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
02859         if (!StackPtr.getNode())
02860           StackPtr = DAG.getCopyFromReg(Chain, dl,
02861                                         RegInfo->getStackRegister(),
02862                                         getPointerTy());
02863         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
02864 
02865         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
02866                                                          ArgChain,
02867                                                          Flags, DAG, dl));
02868       } else {
02869         // Store relative to framepointer.
02870         MemOpChains2.push_back(
02871           DAG.getStore(ArgChain, dl, Arg, FIN,
02872                        MachinePointerInfo::getFixedStack(FI),
02873                        false, false, 0));
02874       }
02875     }
02876 
02877     if (!MemOpChains2.empty())
02878       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
02879 
02880     // Store the return address to the appropriate stack slot.
02881     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
02882                                      getPointerTy(), RegInfo->getSlotSize(),
02883                                      FPDiff, dl);
02884   }
02885 
02886   // Build a sequence of copy-to-reg nodes chained together with token chain
02887   // and flag operands which copy the outgoing args into registers.
02888   SDValue InFlag;
02889   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
02890     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
02891                              RegsToPass[i].second, InFlag);
02892     InFlag = Chain.getValue(1);
02893   }
02894 
02895   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
02896     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
02897     // In the 64-bit large code model, we have to make all calls
02898     // through a register, since the call instruction's 32-bit
02899     // pc-relative offset may not be large enough to hold the whole
02900     // address.
02901   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
02902     // If the callee is a GlobalAddress node (quite common, every direct call
02903     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
02904     // it.
02905 
02906     // We should use extra load for direct calls to dllimported functions in
02907     // non-JIT mode.
02908     const GlobalValue *GV = G->getGlobal();
02909     if (!GV->hasDLLImportStorageClass()) {
02910       unsigned char OpFlags = 0;
02911       bool ExtraLoad = false;
02912       unsigned WrapperKind = ISD::DELETED_NODE;
02913 
02914       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
02915       // external symbols must go through the PLT in PIC mode.  If the symbol
02916       // has hidden or protected visibility, or if it is static or local, then
02917       // we don't need to use the PLT - we can directly call it.
02918       if (Subtarget->isTargetELF() &&
02919           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
02920           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
02921         OpFlags = X86II::MO_PLT;
02922       } else if (Subtarget->isPICStyleStubAny() &&
02923                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
02924                  (!Subtarget->getTargetTriple().isMacOSX() ||
02925                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
02926         // PC-relative references to external symbols should go through $stub,
02927         // unless we're building with the leopard linker or later, which
02928         // automatically synthesizes these stubs.
02929         OpFlags = X86II::MO_DARWIN_STUB;
02930       } else if (Subtarget->isPICStyleRIPRel() &&
02931                  isa<Function>(GV) &&
02932                  cast<Function>(GV)->getAttributes().
02933                    hasAttribute(AttributeSet::FunctionIndex,
02934                                 Attribute::NonLazyBind)) {
02935         // If the function is marked as non-lazy, generate an indirect call
02936         // which loads from the GOT directly. This avoids runtime overhead
02937         // at the cost of eager binding (and one extra byte of encoding).
02938         OpFlags = X86II::MO_GOTPCREL;
02939         WrapperKind = X86ISD::WrapperRIP;
02940         ExtraLoad = true;
02941       }
02942 
02943       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
02944                                           G->getOffset(), OpFlags);
02945 
02946       // Add a wrapper if needed.
02947       if (WrapperKind != ISD::DELETED_NODE)
02948         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
02949       // Add extra indirection if needed.
02950       if (ExtraLoad)
02951         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
02952                              MachinePointerInfo::getGOT(),
02953                              false, false, false, 0);
02954     }
02955   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
02956     unsigned char OpFlags = 0;
02957 
02958     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
02959     // external symbols should go through the PLT.
02960     if (Subtarget->isTargetELF() &&
02961         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
02962       OpFlags = X86II::MO_PLT;
02963     } else if (Subtarget->isPICStyleStubAny() &&
02964                (!Subtarget->getTargetTriple().isMacOSX() ||
02965                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
02966       // PC-relative references to external symbols should go through $stub,
02967       // unless we're building with the leopard linker or later, which
02968       // automatically synthesizes these stubs.
02969       OpFlags = X86II::MO_DARWIN_STUB;
02970     }
02971 
02972     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
02973                                          OpFlags);
02974   }
02975 
02976   // Returns a chain & a flag for retval copy to use.
02977   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
02978   SmallVector<SDValue, 8> Ops;
02979 
02980   if (!IsSibcall && isTailCall) {
02981     Chain = DAG.getCALLSEQ_END(Chain,
02982                                DAG.getIntPtrConstant(NumBytesToPop, true),
02983                                DAG.getIntPtrConstant(0, true), InFlag, dl);
02984     InFlag = Chain.getValue(1);
02985   }
02986 
02987   Ops.push_back(Chain);
02988   Ops.push_back(Callee);
02989 
02990   if (isTailCall)
02991     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
02992 
02993   // Add argument registers to the end of the list so that they are known live
02994   // into the call.
02995   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
02996     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
02997                                   RegsToPass[i].second.getValueType()));
02998 
02999   // Add a register mask operand representing the call-preserved registers.
03000   const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
03001   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
03002   assert(Mask && "Missing call preserved mask for calling convention");
03003   Ops.push_back(DAG.getRegisterMask(Mask));
03004 
03005   if (InFlag.getNode())
03006     Ops.push_back(InFlag);
03007 
03008   if (isTailCall) {
03009     // We used to do:
03010     //// If this is the first return lowered for this function, add the regs
03011     //// to the liveout set for the function.
03012     // This isn't right, although it's probably harmless on x86; liveouts
03013     // should be computed from returns not tail calls.  Consider a void
03014     // function making a tail call to a function returning int.
03015     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03016   }
03017 
03018   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03019   InFlag = Chain.getValue(1);
03020 
03021   // Create the CALLSEQ_END node.
03022   unsigned NumBytesForCalleeToPop;
03023   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03024                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03025     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03026   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03027            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03028            SR == StackStructReturn)
03029     // If this is a call to a struct-return function, the callee
03030     // pops the hidden struct pointer, so we have to push it back.
03031     // This is common for Darwin/X86, Linux & Mingw32 targets.
03032     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03033     NumBytesForCalleeToPop = 4;
03034   else
03035     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03036 
03037   // Returns a flag for retval copy to use.
03038   if (!IsSibcall) {
03039     Chain = DAG.getCALLSEQ_END(Chain,
03040                                DAG.getIntPtrConstant(NumBytesToPop, true),
03041                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03042                                                      true),
03043                                InFlag, dl);
03044     InFlag = Chain.getValue(1);
03045   }
03046 
03047   // Handle result values, copying them out of physregs into vregs that we
03048   // return.
03049   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03050                          Ins, dl, DAG, InVals);
03051 }
03052 
03053 //===----------------------------------------------------------------------===//
03054 //                Fast Calling Convention (tail call) implementation
03055 //===----------------------------------------------------------------------===//
03056 
03057 //  Like stdcall, the callee cleans up the arguments, except that ECX is
03058 //  reserved for storing the address of the tail-called function. Only 2 registers are
03059 //  free for argument passing (inreg). Tail call optimization is performed
03060 //  provided:
03061 //                * tailcallopt is enabled
03062 //                * caller/callee are fastcc
03063 //  On X86_64 architecture with GOT-style position independent code only local
03064 //  (within module) calls are supported at the moment.
03065 //  To keep the stack aligned according to the platform ABI, the function
03066 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
03067 //  multiple of the stack alignment. (Dynamic linkers such as Darwin's dyld need this.)
03068 //  If a tail-called callee has more arguments than the caller, the
03069 //  caller needs to make sure that there is room to move the RETADDR to. This is
03070 //  achieved by reserving an area the size of the argument delta right after the
03071 //  original RETADDR, but before the saved framepointer or the spilled registers
03072 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
03073 //  stack layout:
03074 //    arg1
03075 //    arg2
03076 //    RETADDR
03077 //    [ new RETADDR
03078 //      move area ]
03079 //    (possible EBP)
03080 //    ESI
03081 //    EDI
03082 //    local1 ..
03083 
03084 /// GetAlignedArgumentStackSize - Align the stack argument size, e.g. to
03085 /// 16n + 12 for a 16 byte alignment requirement (with a 4 byte slot size).
03086 unsigned
03087 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03088                                                SelectionDAG& DAG) const {
03089   MachineFunction &MF = DAG.getMachineFunction();
03090   const TargetMachine &TM = MF.getTarget();
03091   const X86RegisterInfo *RegInfo =
03092     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
03093   const TargetFrameLowering &TFI = *TM.getFrameLowering();
03094   unsigned StackAlignment = TFI.getStackAlignment();
03095   uint64_t AlignMask = StackAlignment - 1;
03096   int64_t Offset = StackSize;
03097   unsigned SlotSize = RegInfo->getSlotSize();
03098   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03099     // The remainder is at most StackAlignment - SlotSize; just add the difference.
03100     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03101   } else {
03102     // Mask out the lower bits, then add one full stack alignment plus
03102     // (StackAlignment - SlotSize).
03103     Offset = ((~AlignMask) & Offset) + StackAlignment +
03104       (StackAlignment-SlotSize);
03105   }
03106   return Offset;
03107 }
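// Worked example (illustrative numbers): with a 16-byte stack alignment and an
// 8-byte slot size, an incoming StackSize of 20 has (20 & 15) == 4, which is
// <= 16 - 8, so the result is 20 + (8 - 4) == 24.  24 is congruent to 8 mod 16,
// so once the 8-byte return address is pushed on top the stack is again
// 16-byte aligned.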
03108 
03109 /// MatchingStackOffset - Return true if the given stack call argument is
03110 /// already available in the same relative position in the caller's incoming
03111 /// argument stack.
03112 static
03113 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03114                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03115                          const X86InstrInfo *TII) {
03116   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03117   int FI = INT_MAX;
03118   if (Arg.getOpcode() == ISD::CopyFromReg) {
03119     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03120     if (!TargetRegisterInfo::isVirtualRegister(VR))
03121       return false;
03122     MachineInstr *Def = MRI->getVRegDef(VR);
03123     if (!Def)
03124       return false;
03125     if (!Flags.isByVal()) {
03126       if (!TII->isLoadFromStackSlot(Def, FI))
03127         return false;
03128     } else {
03129       unsigned Opcode = Def->getOpcode();
03130       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
03131           Def->getOperand(1).isFI()) {
03132         FI = Def->getOperand(1).getIndex();
03133         Bytes = Flags.getByValSize();
03134       } else
03135         return false;
03136     }
03137   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03138     if (Flags.isByVal())
03139       // ByVal argument is passed in as a pointer but it's now being
03140       // dereferenced. e.g.
03141       // define @foo(%struct.X* %A) {
03142       //   tail call @bar(%struct.X* byval %A)
03143       // }
03144       return false;
03145     SDValue Ptr = Ld->getBasePtr();
03146     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03147     if (!FINode)
03148       return false;
03149     FI = FINode->getIndex();
03150   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03151     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03152     FI = FINode->getIndex();
03153     Bytes = Flags.getByValSize();
03154   } else
03155     return false;
03156 
03157   assert(FI != INT_MAX);
03158   if (!MFI->isFixedObjectIndex(FI))
03159     return false;
03160   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03161 }
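// Illustrative example: if the caller received an i32 at incoming stack offset
// 4 and forwards the loaded value unchanged as an outgoing argument that the
// callee also expects at offset 4, the value traces back to a fixed frame
// object whose offset (4) and size (4 bytes) both match, so MatchingStackOffset
// returns true and the sibcall check below does not reject that argument.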
03162 
03163 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03164 /// for tail call optimization. Targets which want to do tail call
03165 /// optimization should implement this function.
03166 bool
03167 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03168                                                      CallingConv::ID CalleeCC,
03169                                                      bool isVarArg,
03170                                                      bool isCalleeStructRet,
03171                                                      bool isCallerStructRet,
03172                                                      Type *RetTy,
03173                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03174                                     const SmallVectorImpl<SDValue> &OutVals,
03175                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03176                                                      SelectionDAG &DAG) const {
03177   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03178     return false;
03179 
03180   // If -tailcallopt is specified, make fastcc functions tail-callable.
03181   const MachineFunction &MF = DAG.getMachineFunction();
03182   const Function *CallerF = MF.getFunction();
03183 
03184   // If the function return type is x86_fp80 and the callee return type is not,
03185   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03186   // perform a tailcall optimization here.
03187   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03188     return false;
03189 
03190   CallingConv::ID CallerCC = CallerF->getCallingConv();
03191   bool CCMatch = CallerCC == CalleeCC;
03192   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03193   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03194 
03195   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
03196     if (IsTailCallConvention(CalleeCC) && CCMatch)
03197       return true;
03198     return false;
03199   }
03200 
03201   // Look for obvious safe cases to perform tail call optimization that do not
03202   // require ABI changes. This is what gcc calls sibcall.
03203 
03204   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03205   // emit a special epilogue.
03206   const X86RegisterInfo *RegInfo =
03207     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
03208   if (RegInfo->needsStackRealignment(MF))
03209     return false;
03210 
03211   // Also avoid sibcall optimization if either caller or callee uses struct
03212   // return semantics.
03213   if (isCalleeStructRet || isCallerStructRet)
03214     return false;
03215 
03216   // An stdcall/thiscall caller is expected to clean up its arguments; the
03217   // callee isn't going to do that.
03218   // FIXME: this is more restrictive than needed. We could produce a tailcall
03219   // when the stack adjustment matches. For example, with a thiscall that takes
03220   // only one argument.
03221   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03222                    CallerCC == CallingConv::X86_ThisCall))
03223     return false;
03224 
03225   // Do not sibcall optimize vararg calls unless all arguments are passed via
03226   // registers.
03227   if (isVarArg && !Outs.empty()) {
03228 
03229     // Optimizing for varargs on Win64 is unlikely to be safe without
03230     // additional testing.
03231     if (IsCalleeWin64 || IsCallerWin64)
03232       return false;
03233 
03234     SmallVector<CCValAssign, 16> ArgLocs;
03235     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
03236                    DAG.getTarget(), ArgLocs, *DAG.getContext());
03237 
03238     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03239     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03240       if (!ArgLocs[i].isRegLoc())
03241         return false;
03242   }
03243 
03244   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03245   // stack.  Therefore, if the result is not used, it is not safe to optimize
03246   // this into a sibcall.
03247   bool Unused = false;
03248   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03249     if (!Ins[i].Used) {
03250       Unused = true;
03251       break;
03252     }
03253   }
03254   if (Unused) {
03255     SmallVector<CCValAssign, 16> RVLocs;
03256     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
03257                    DAG.getTarget(), RVLocs, *DAG.getContext());
03258     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03259     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03260       CCValAssign &VA = RVLocs[i];
03261       if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
03262         return false;
03263     }
03264   }
03265 
03266   // If the calling conventions do not match, then we'd better make sure the
03267   // results are returned in the same way as what the caller expects.
03268   if (!CCMatch) {
03269     SmallVector<CCValAssign, 16> RVLocs1;
03270     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
03271                     DAG.getTarget(), RVLocs1, *DAG.getContext());
03272     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03273 
03274     SmallVector<CCValAssign, 16> RVLocs2;
03275     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
03276                     DAG.getTarget(), RVLocs2, *DAG.getContext());
03277     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03278 
03279     if (RVLocs1.size() != RVLocs2.size())
03280       return false;
03281     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03282       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03283         return false;
03284       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03285         return false;
03286       if (RVLocs1[i].isRegLoc()) {
03287         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03288           return false;
03289       } else {
03290         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03291           return false;
03292       }
03293     }
03294   }
03295 
03296   // If the callee takes no arguments then go on to check the results of the
03297   // call.
03298   if (!Outs.empty()) {
03299     // Check if stack adjustment is needed. For now, do not do this if any
03300     // argument is passed on the stack.
03301     SmallVector<CCValAssign, 16> ArgLocs;
03302     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
03303                    DAG.getTarget(), ArgLocs, *DAG.getContext());
03304 
03305     // Allocate shadow area for Win64
03306     if (IsCalleeWin64)
03307       CCInfo.AllocateStack(32, 8);
03308 
03309     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03310     if (CCInfo.getNextStackOffset()) {
03311       MachineFunction &MF = DAG.getMachineFunction();
03312       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03313         return false;
03314 
03315       // Check if the arguments are already laid out in the right way as
03316       // the caller's fixed stack objects.
03317       MachineFrameInfo *MFI = MF.getFrameInfo();
03318       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03319       const X86InstrInfo *TII =
03320           static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo());
03321       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03322         CCValAssign &VA = ArgLocs[i];
03323         SDValue Arg = OutVals[i];
03324         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03325         if (VA.getLocInfo() == CCValAssign::Indirect)
03326           return false;
03327         if (!VA.isRegLoc()) {
03328           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03329                                    MFI, MRI, TII))
03330             return false;
03331         }
03332       }
03333     }
03334 
03335     // If the tailcall address may be in a register, then make sure it's
03336     // possible to register allocate for it. In 32-bit, the call address can
03337     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03338     // callee-saved registers are restored. These happen to be the same
03339     // registers used to pass 'inreg' arguments so watch out for those.
03340     if (!Subtarget->is64Bit() &&
03341         ((!isa<GlobalAddressSDNode>(Callee) &&
03342           !isa<ExternalSymbolSDNode>(Callee)) ||
03343          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
03344       unsigned NumInRegs = 0;
03345       // In PIC we need an extra register to formulate the address computation
03346       // for the callee.
03347       unsigned MaxInRegs =
03348   (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03349 
03350       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03351         CCValAssign &VA = ArgLocs[i];
03352         if (!VA.isRegLoc())
03353           continue;
03354         unsigned Reg = VA.getLocReg();
03355         switch (Reg) {
03356         default: break;
03357         case X86::EAX: case X86::EDX: case X86::ECX:
03358           if (++NumInRegs == MaxInRegs)
03359             return false;
03360           break;
03361         }
03362       }
03363     }
03364   }
03365 
03366   return true;
03367 }
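// Illustrative examples (assumed IR): without -tailcallopt,
//
//   define i32 @a(i32 %x) {
//     %r = tail call i32 @b(i32 %x)
//     ret i32 %r
//   }
//
// is a typical sibcall candidate (matching C calling convention, no struct
// return, and on x86-64 the argument is already in a register), while a callee
// that takes an sret pointer or needs its stack arguments shuffled is rejected
// by the checks above and lowered as an ordinary call.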
03368 
03369 FastISel *
03370 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03371                                   const TargetLibraryInfo *libInfo) const {
03372   return X86::createFastISel(funcInfo, libInfo);
03373 }
03374 
03375 //===----------------------------------------------------------------------===//
03376 //                           Other Lowering Hooks
03377 //===----------------------------------------------------------------------===//
03378 
03379 static bool MayFoldLoad(SDValue Op) {
03380   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03381 }
03382 
03383 static bool MayFoldIntoStore(SDValue Op) {
03384   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03385 }
03386 
03387 static bool isTargetShuffle(unsigned Opcode) {
03388   switch(Opcode) {
03389   default: return false;
03390   case X86ISD::PSHUFD:
03391   case X86ISD::PSHUFHW:
03392   case X86ISD::PSHUFLW:
03393   case X86ISD::SHUFP:
03394   case X86ISD::PALIGNR:
03395   case X86ISD::MOVLHPS:
03396   case X86ISD::MOVLHPD:
03397   case X86ISD::MOVHLPS:
03398   case X86ISD::MOVLPS:
03399   case X86ISD::MOVLPD:
03400   case X86ISD::MOVSHDUP:
03401   case X86ISD::MOVSLDUP:
03402   case X86ISD::MOVDDUP:
03403   case X86ISD::MOVSS:
03404   case X86ISD::MOVSD:
03405   case X86ISD::UNPCKL:
03406   case X86ISD::UNPCKH:
03407   case X86ISD::VPERMILP:
03408   case X86ISD::VPERM2X128:
03409   case X86ISD::VPERMI:
03410     return true;
03411   }
03412 }
03413 
03414 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03415                                     SDValue V1, SelectionDAG &DAG) {
03416   switch(Opc) {
03417   default: llvm_unreachable("Unknown x86 shuffle node");
03418   case X86ISD::MOVSHDUP:
03419   case X86ISD::MOVSLDUP:
03420   case X86ISD::MOVDDUP:
03421     return DAG.getNode(Opc, dl, VT, V1);
03422   }
03423 }
03424 
03425 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03426                                     SDValue V1, unsigned TargetMask,
03427                                     SelectionDAG &DAG) {
03428   switch(Opc) {
03429   default: llvm_unreachable("Unknown x86 shuffle node");
03430   case X86ISD::PSHUFD:
03431   case X86ISD::PSHUFHW:
03432   case X86ISD::PSHUFLW:
03433   case X86ISD::VPERMILP:
03434   case X86ISD::VPERMI:
03435     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03436   }
03437 }
03438 
03439 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03440                                     SDValue V1, SDValue V2, unsigned TargetMask,
03441                                     SelectionDAG &DAG) {
03442   switch(Opc) {
03443   default: llvm_unreachable("Unknown x86 shuffle node");
03444   case X86ISD::PALIGNR:
03445   case X86ISD::SHUFP:
03446   case X86ISD::VPERM2X128:
03447     return DAG.getNode(Opc, dl, VT, V1, V2,
03448                        DAG.getConstant(TargetMask, MVT::i8));
03449   }
03450 }
03451 
03452 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03453                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03454   switch(Opc) {
03455   default: llvm_unreachable("Unknown x86 shuffle node");
03456   case X86ISD::MOVLHPS:
03457   case X86ISD::MOVLHPD:
03458   case X86ISD::MOVHLPS:
03459   case X86ISD::MOVLPS:
03460   case X86ISD::MOVLPD:
03461   case X86ISD::MOVSS:
03462   case X86ISD::MOVSD:
03463   case X86ISD::UNPCKL:
03464   case X86ISD::UNPCKH:
03465     return DAG.getNode(Opc, dl, VT, V1, V2);
03466   }
03467 }
03468 
03469 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03470   MachineFunction &MF = DAG.getMachineFunction();
03471   const X86RegisterInfo *RegInfo =
03472     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
03473   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03474   int ReturnAddrIndex = FuncInfo->getRAIndex();
03475 
03476   if (ReturnAddrIndex == 0) {
03477     // Set up a frame object for the return address.
03478     unsigned SlotSize = RegInfo->getSlotSize();
03479     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03480                                                            -(int64_t)SlotSize,
03481                                                            false);
03482     FuncInfo->setRAIndex(ReturnAddrIndex);
03483   }
03484 
03485   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03486 }
03487 
03488 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03489                                        bool hasSymbolicDisplacement) {
03490   // Offset should fit into 32 bit immediate field.
03491   if (!isInt<32>(Offset))
03492     return false;
03493 
03494   // If we don't have a symbolic displacement - we don't have any extra
03495   // restrictions.
03496   if (!hasSymbolicDisplacement)
03497     return true;
03498 
03499   // FIXME: Some tweaks might be needed for medium code model.
03500   if (M != CodeModel::Small && M != CodeModel::Kernel)
03501     return false;
03502 
03503   // For the small code model we assume that the last object lies at least 16MB
03504   // below the 31-bit boundary. We may also accept pretty large negative constants,
03505   // knowing that all objects are in the positive half of the address space.
03506   if (M == CodeModel::Small && Offset < 16*1024*1024)
03507     return true;
03508 
03509   // For the kernel code model we know that all objects reside in the negative
03510   // half of the 32-bit address space. We may not accept negative offsets, since
03511   // they may be just out of range, but we may accept pretty large positive ones.
03512   if (M == CodeModel::Kernel && Offset > 0)
03513     return true;
03514 
03515   return false;
03516 }
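
// [Editorial illustration, not part of the original source file.] A few worked
// calls against the rules above, all assuming a symbolic displacement:
//
//   X86::isOffsetSuitableForCodeModel(15 << 20, CodeModel::Small,  true); // true
//   X86::isOffsetSuitableForCodeModel(17 << 20, CodeModel::Small,  true); // false
//   X86::isOffsetSuitableForCodeModel( 1024,    CodeModel::Kernel, true); // true
//   X86::isOffsetSuitableForCodeModel(-1024,    CodeModel::Kernel, true); // false
//
// Without a symbolic displacement, any offset that fits a signed 32-bit
// immediate is accepted regardless of code model.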
03517 
03518 /// isCalleePop - Determines whether the callee is required to pop its
03519 /// own arguments. Callee pop is necessary to support tail calls.
03520 bool X86::isCalleePop(CallingConv::ID CallingConv,
03521                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03522   if (IsVarArg)
03523     return false;
03524 
03525   switch (CallingConv) {
03526   default:
03527     return false;
03528   case CallingConv::X86_StdCall:
03529     return !is64Bit;
03530   case CallingConv::X86_FastCall:
03531     return !is64Bit;
03532   case CallingConv::X86_ThisCall:
03533     return !is64Bit;
03534   case CallingConv::Fast:
03535     return TailCallOpt;
03536   case CallingConv::GHC:
03537     return TailCallOpt;
03538   case CallingConv::HiPE:
03539     return TailCallOpt;
03540   }
03541 }
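
// [Editorial illustration, not part of the original source file.] For example,
// a non-variadic stdcall function pops its own arguments only in 32-bit mode,
// while fastcc pops only under -tailcallopt:
//
//   X86::isCalleePop(CallingConv::X86_StdCall, /*is64Bit=*/false,
//                    /*IsVarArg=*/false, /*TailCallOpt=*/false);          // true
//   X86::isCalleePop(CallingConv::X86_StdCall, /*is64Bit=*/true,
//                    /*IsVarArg=*/false, /*TailCallOpt=*/false);          // false
//   X86::isCalleePop(CallingConv::Fast, false, false, /*TailCallOpt=*/true); // true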
03542 
03543 /// \brief Return true if the condition is an unsigned comparison operation.
03544 static bool isX86CCUnsigned(unsigned X86CC) {
03545   switch (X86CC) {
03546   default: llvm_unreachable("Invalid integer condition!");
03547   case X86::COND_E:     return true;
03548   case X86::COND_G:     return false;
03549   case X86::COND_GE:    return false;
03550   case X86::COND_L:     return false;
03551   case X86::COND_LE:    return false;
03552   case X86::COND_NE:    return true;
03553   case X86::COND_B:     return true;
03554   case X86::COND_A:     return true;
03555   case X86::COND_BE:    return true;
03556   case X86::COND_AE:    return true;
03557   }
03558   llvm_unreachable("covered switch fell through?!");
03559 }
03560 
03561 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
03562 /// specific condition code, returning the condition code and the LHS/RHS of the
03563 /// comparison to make.
03564 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03565                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03566   if (!isFP) {
03567     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03568       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03569         // X > -1   -> X == 0, jump !sign.
03570         RHS = DAG.getConstant(0, RHS.getValueType());
03571         return X86::COND_NS;
03572       }
03573       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03574         // X < 0   -> X == 0, jump on sign.
03575         return X86::COND_S;
03576       }
03577       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03578         // X < 1   -> X <= 0
03579         RHS = DAG.getConstant(0, RHS.getValueType());
03580         return X86::COND_LE;
03581       }
03582     }
03583 
03584     switch (SetCCOpcode) {
03585     default: llvm_unreachable("Invalid integer condition!");
03586     case ISD::SETEQ:  return X86::COND_E;
03587     case ISD::SETGT:  return X86::COND_G;
03588     case ISD::SETGE:  return X86::COND_GE;
03589     case ISD::SETLT:  return X86::COND_L;
03590     case ISD::SETLE:  return X86::COND_LE;
03591     case ISD::SETNE:  return X86::COND_NE;
03592     case ISD::SETULT: return X86::COND_B;
03593     case ISD::SETUGT: return X86::COND_A;
03594     case ISD::SETULE: return X86::COND_BE;
03595     case ISD::SETUGE: return X86::COND_AE;
03596     }
03597   }
03598 
03599   // First determine if it is required or is profitable to flip the operands.
03600 
03601   // If LHS is a foldable load, but RHS is not, flip the condition.
03602   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03603       !ISD::isNON_EXTLoad(RHS.getNode())) {
03604     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03605     std::swap(LHS, RHS);
03606   }
03607 
03608   switch (SetCCOpcode) {
03609   default: break;
03610   case ISD::SETOLT:
03611   case ISD::SETOLE:
03612   case ISD::SETUGT:
03613   case ISD::SETUGE:
03614     std::swap(LHS, RHS);
03615     break;
03616   }
03617 
03618   // On a floating point condition, the flags are set as follows:
03619   // ZF  PF  CF   op
03620   //  0 | 0 | 0 | X > Y
03621   //  0 | 0 | 1 | X < Y
03622   //  1 | 0 | 0 | X == Y
03623   //  1 | 1 | 1 | unordered
03624   switch (SetCCOpcode) {
03625   default: llvm_unreachable("Condcode should be pre-legalized away");
03626   case ISD::SETUEQ:
03627   case ISD::SETEQ:   return X86::COND_E;
03628   case ISD::SETOLT:              // flipped
03629   case ISD::SETOGT:
03630   case ISD::SETGT:   return X86::COND_A;
03631   case ISD::SETOLE:              // flipped
03632   case ISD::SETOGE:
03633   case ISD::SETGE:   return X86::COND_AE;
03634   case ISD::SETUGT:              // flipped
03635   case ISD::SETULT:
03636   case ISD::SETLT:   return X86::COND_B;
03637   case ISD::SETUGE:              // flipped
03638   case ISD::SETULE:
03639   case ISD::SETLE:   return X86::COND_BE;
03640   case ISD::SETONE:
03641   case ISD::SETNE:   return X86::COND_NE;
03642   case ISD::SETUO:   return X86::COND_P;
03643   case ISD::SETO:    return X86::COND_NP;
03644   case ISD::SETOEQ:
03645   case ISD::SETUNE:  return X86::COND_INVALID;
03646   }
03647 }
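
// [Editorial illustration, not part of the original source file.] A few
// representative translations performed by TranslateX86CC, for integer and
// floating-point operands respectively:
//
//   (setult x, y)  -> X86::COND_B, operands unchanged.
//   (setgt  x, -1) -> RHS rewritten to 0, X86::COND_NS (branch if sign clear).
//   (setolt x, y)  -> operands swapped, then X86::COND_A, so the comparison is
//                     emitted as "y > x" and read from CF/ZF per the flag table.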
03648 
03649 /// hasFPCMov - Return true if there is a floating point cmov for the specific X86
03650 /// condition code. The current x86 ISA includes the following FP cmov instructions:
03651 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03652 static bool hasFPCMov(unsigned X86CC) {
03653   switch (X86CC) {
03654   default:
03655     return false;
03656   case X86::COND_B:
03657   case X86::COND_BE:
03658   case X86::COND_E:
03659   case X86::COND_P:
03660   case X86::COND_A:
03661   case X86::COND_AE:
03662   case X86::COND_NE:
03663   case X86::COND_NP:
03664     return true;
03665   }
03666 }
03667 
03668 /// isFPImmLegal - Returns true if the target can instruction select the
03669 /// specified FP immediate natively. If false, the legalizer will
03670 /// materialize the FP immediate as a load from a constant pool.
03671 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03672   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03673     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03674       return true;
03675   }
03676   return false;
03677 }
03678 
03679 /// \brief Returns true if it is beneficial to convert a load of a constant
03680 /// to just the constant itself.
03681 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03682                                                           Type *Ty) const {
03683   assert(Ty->isIntegerTy());
03684 
03685   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03686   if (BitSize == 0 || BitSize > 64)
03687     return false;
03688   return true;
03689 }
03690 
03691 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03692 /// the specified range [Low, Hi).
03693 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03694   return (Val < 0) || (Val >= Low && Val < Hi);
03695 }
03696 
03697 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03698 /// specified value.
03699 static bool isUndefOrEqual(int Val, int CmpVal) {
03700   return (Val < 0 || Val == CmpVal);
03701 }
03702 
03703 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03704 /// at position Pos and ending at Pos+Size, falls within the specified
03705 /// sequential range [Low, Low+Size), or is undef.
03706 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03707                                        unsigned Pos, unsigned Size, int Low) {
03708   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03709     if (!isUndefOrEqual(Mask[i], Low))
03710       return false;
03711   return true;
03712 }
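
// [Editorial illustration, not part of the original source file.] With a
// hypothetical Mask = <4, -1, 6, 7>:
//
//   isUndefOrEqual(Mask[1], 5)                 -> true  (undef matches anything)
//   isUndefOrInRange(Mask[0], 4, 8)            -> true  (4 lies in [4, 8))
//   isSequentialOrUndefInRange(Mask, 0, 4, 4)  -> true  (matches 4, 5, 6, 7)
//   isSequentialOrUndefInRange(Mask, 0, 4, 0)  -> false (expects 0, 1, 2, 3)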
03713 
03714 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
03715 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
03716 /// the second operand.
03717 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
03718   if (VT == MVT::v4f32 || VT == MVT::v4i32)
03719     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
03720   if (VT == MVT::v2f64 || VT == MVT::v2i64)
03721     return (Mask[0] < 2 && Mask[1] < 2);
03722   return false;
03723 }
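
// [Editorial illustration, not part of the original source file.] PSHUFD reads
// a single input vector, so every mask index must stay below the element
// count. For v4i32:
//
//   <2, 1, 0, 3>  -> accepted (a permutation of one source)
//   <0, 4, 1, 5>  -> rejected (indices 4 and 5 reference a second vector)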
03724 
03725 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
03726 /// is suitable for input to PSHUFHW.
03727 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03728   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03729     return false;
03730 
03731   // Lower quadword copied in order or undef.
03732   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
03733     return false;
03734 
03735   // Upper quadword shuffled.
03736   for (unsigned i = 4; i != 8; ++i)
03737     if (!isUndefOrInRange(Mask[i], 4, 8))
03738       return false;
03739 
03740   if (VT == MVT::v16i16) {
03741     // Lower quadword copied in order or undef.
03742     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
03743       return false;
03744 
03745     // Upper quadword shuffled.
03746     for (unsigned i = 12; i != 16; ++i)
03747       if (!isUndefOrInRange(Mask[i], 12, 16))
03748         return false;
03749   }
03750 
03751   return true;
03752 }
03753 
03754 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
03755 /// is suitable for input to PSHUFLW.
03756 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03757   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03758     return false;
03759 
03760   // Upper quadword copied in order.
03761   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
03762     return false;
03763 
03764   // Lower quadword shuffled.
03765   for (unsigned i = 0; i != 4; ++i)
03766     if (!isUndefOrInRange(Mask[i], 0, 4))
03767       return false;
03768 
03769   if (VT == MVT::v16i16) {
03770     // Upper quadword copied in order.
03771     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
03772       return false;
03773 
03774     // Lower quadword shuffled.
03775     for (unsigned i = 8; i != 12; ++i)
03776       if (!isUndefOrInRange(Mask[i], 8, 12))
03777         return false;
03778   }
03779 
03780   return true;
03781 }
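
// [Editorial illustration, not part of the original source file.] For v8i16,
// PSHUFLW may only rearrange the low four words; the high four must pass
// through in order (or be undef):
//
//   <3, 2, 1, 0, 4, 5, 6, 7>  -> accepted (low half reversed)
//   <3, 2, 1, 0, 7, 6, 5, 4>  -> rejected (high half is shuffled too)
//
// isPSHUFHWMask above applies the mirror-image rule to the high four words.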
03782 
03783 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
03784 /// is suitable for input to PALIGNR.
03785 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
03786                           const X86Subtarget *Subtarget) {
03787   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
03788       (VT.is256BitVector() && !Subtarget->hasInt256()))
03789     return false;
03790 
03791   unsigned NumElts = VT.getVectorNumElements();
03792   unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
03793   unsigned NumLaneElts = NumElts/NumLanes;
03794 
03795   // Do not handle 64-bit element shuffles with palignr.
03796   if (NumLaneElts == 2)
03797     return false;
03798 
03799   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
03800     unsigned i;
03801     for (i = 0; i != NumLaneElts; ++i) {
03802       if (Mask[i+l] >= 0)
03803         break;
03804     }
03805 
03806     // Lane is all undef, go to next lane
03807     if (i == NumLaneElts)
03808       continue;
03809 
03810     int Start = Mask[i+l];
03811 
03812     // Make sure it's in this lane in one of the sources
03813     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
03814         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
03815       return false;
03816 
03817     // If not lane 0, then we must match lane 0
03818     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
03819       return false;
03820 
03821     // Correct second source to be contiguous with first source
03822     if (Start >= (int)NumElts)
03823       Start -= NumElts - NumLaneElts;
03824 
03825     // Make sure we're shifting in the right direction.
03826     if (Start <= (int)(i+l))
03827       return false;
03828 
03829     Start -= i;
03830 
03831     // Check the rest of the elements to see if they are consecutive.
03832     for (++i; i != NumLaneElts; ++i) {
03833       int Idx = Mask[i+l];
03834 
03835       // Make sure it's in this lane
03836       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
03837           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
03838         return false;
03839 
03840       // If not lane 0, then we must match lane 0
03841       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
03842         return false;
03843 
03844       if (Idx >= (int)NumElts)
03845         Idx -= NumElts - NumLaneElts;
03846 
03847       if (!isUndefOrEqual(Idx, Start+i))
03848         return false;
03849 
03850     }
03851   }
03852 
03853   return true;
03854 }
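
// [Editorial illustration, not part of the original source file.] PALIGNR
// concatenates the two sources and extracts a byte-shifted window. For a
// hypothetical v8i16 shuffle <1, 2, 3, 4, 5, 6, 7, 8> (elements 1..7 of V1
// followed by element 0 of V2) this predicate succeeds, and
// getShufflePALIGNRImmediate further below encodes it as a shift of
// 1 element * 2 bytes, i.e. an immediate of 2.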
03855 
03856 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
03857 /// the two vector operands have swapped position.
03858 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
03859                                      unsigned NumElems) {
03860   for (unsigned i = 0; i != NumElems; ++i) {
03861     int idx = Mask[i];
03862     if (idx < 0)
03863       continue;
03864     else if (idx < (int)NumElems)
03865       Mask[i] = idx + NumElems;
03866     else
03867       Mask[i] = idx - NumElems;
03868   }
03869 }
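
// [Editorial illustration, not part of the original source file.] With
// NumElems = 4, the mask <0, 5, -1, 7> (V1[0], V2[1], undef, V2[3]) becomes
// <4, 1, -1, 3> after commuting, i.e. the same selection with V1 and V2
// exchanged.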
03870 
03871 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
03872 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
03873 /// SHUFPS and SHUFPD. If Commuted is true, then it checks whether the sources
03874 /// are in the reverse order of what x86 shuffles want.
03875 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
03876 
03877   unsigned NumElems = VT.getVectorNumElements();
03878   unsigned NumLanes = VT.getSizeInBits()/128;
03879   unsigned NumLaneElems = NumElems/NumLanes;
03880 
03881   if (NumLaneElems != 2 && NumLaneElems != 4)
03882     return false;
03883 
03884   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
03885   bool symetricMaskRequired =
03886     (VT.getSizeInBits() >= 256) && (EltSize == 32);
03887 
03888   // VSHUFPSY divides the resulting vector into 4 chunks.
03889   // The sources are also split into 4 chunks, and each destination
03890   // chunk must come from a different source chunk.
03891   //
03892   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
03893   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
03894   //
03895   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
03896   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
03897   //
03898   // VSHUFPDY divides the resulting vector into 4 chunks.
03899   // The sources are also split into 4 chunks, and each destination
03900   // chunk must come from a different source chunk.
03901   //
03902   //  SRC1 =>      X3       X2       X1       X0
03903   //  SRC2 =>      Y3       Y2       Y1       Y0
03904   //
03905   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
03906   //
03907   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
03908   unsigned HalfLaneElems = NumLaneElems/2;
03909   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
03910     for (unsigned i = 0; i != NumLaneElems; ++i) {
03911       int Idx = Mask[i+l];
03912       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
03913       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
03914         return false;
03915       // For VSHUFPSY, the mask of the second half must be the same as the
03916       // first but with the appropriate offsets. This works in the same way as
03917       // VPERMILPS works with masks.
03918       if (!symetricMaskRequired || Idx < 0)
03919         continue;
03920       if (MaskVal[i] < 0) {
03921         MaskVal[i] = Idx - l;
03922         continue;
03923       }
03924       if ((signed)(Idx - l) != MaskVal[i])
03925         return false;
03926     }
03927   }
03928 
03929   return true;
03930 }
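
// [Editorial illustration, not part of the original source file.] For v4f32,
// SHUFPS builds the low half of the result from the first source and the high
// half from the second, so:
//
//   <0, 1, 4, 5>  -> accepted with Commuted == false
//   <4, 5, 0, 1>  -> rejected unless Commuted == true (sources reversed)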
03931 
03932 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
03933 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
03934 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
03935   if (!VT.is128BitVector())
03936     return false;
03937 
03938   unsigned NumElems = VT.getVectorNumElements();
03939 
03940   if (NumElems != 4)
03941     return false;
03942 
03943   // Expect Mask[0] == 6, Mask[1] == 7, Mask[2] == 2, Mask[3] == 3
03944   return isUndefOrEqual(Mask[0], 6) &&
03945          isUndefOrEqual(Mask[1], 7) &&
03946          isUndefOrEqual(Mask[2], 2) &&
03947          isUndefOrEqual(Mask[3], 3);
03948 }
03949 
03950 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
03951 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
03952 /// <2, 3, 2, 3>
03953 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
03954   if (!VT.is128BitVector())
03955     return false;
03956 
03957   unsigned NumElems = VT.getVectorNumElements();
03958 
03959   if (NumElems != 4)
03960     return false;
03961 
03962   return isUndefOrEqual(Mask[0], 2) &&
03963          isUndefOrEqual(Mask[1], 3) &&
03964          isUndefOrEqual(Mask[2], 2) &&
03965          isUndefOrEqual(Mask[3], 3);
03966 }
03967 
03968 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
03969 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
03970 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
03971   if (!VT.is128BitVector())
03972     return false;
03973 
03974   unsigned NumElems = VT.getVectorNumElements();
03975 
03976   if (NumElems != 2 && NumElems != 4)
03977     return false;
03978 
03979   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
03980     if (!isUndefOrEqual(Mask[i], i + NumElems))
03981       return false;
03982 
03983   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
03984     if (!isUndefOrEqual(Mask[i], i))
03985       return false;
03986 
03987   return true;
03988 }
03989 
03990 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
03991 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
03992 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
03993   if (!VT.is128BitVector())
03994     return false;
03995 
03996   unsigned NumElems = VT.getVectorNumElements();
03997 
03998   if (NumElems != 2 && NumElems != 4)
03999     return false;
04000 
04001   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04002     if (!isUndefOrEqual(Mask[i], i))
04003       return false;
04004 
04005   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04006     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
04007       return false;
04008 
04009   return true;
04010 }
04011 
04012 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
04013 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
04014 /// i.e., if all but one element come from the same vector.
04015 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
04016   // TODO: Deal with AVX's VINSERTPS
04017   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
04018     return false;
04019 
04020   unsigned CorrectPosV1 = 0;
04021   unsigned CorrectPosV2 = 0;
04022   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
04023     if (Mask[i] == -1) {
04024       ++CorrectPosV1;
04025       ++CorrectPosV2;
04026       continue;
04027     }
04028 
04029     if (Mask[i] == i)
04030       ++CorrectPosV1;
04031     else if (Mask[i] == i + 4)
04032       ++CorrectPosV2;
04033   }
04034 
04035   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
04036     // We have 3 elements (undefs count as elements from any vector) from one
04037     // vector, and one from another.
04038     return true;
04039 
04040   return false;
04041 }
04042 
04043 //
04044 // Some special combinations that can be optimized.
04045 //
04046 static
04047 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
04048                                SelectionDAG &DAG) {
04049   MVT VT = SVOp->getSimpleValueType(0);
04050   SDLoc dl(SVOp);
04051 
04052   if (VT != MVT::v8i32 && VT != MVT::v8f32)
04053     return SDValue();
04054 
04055   ArrayRef<int> Mask = SVOp->getMask();
04056 
04057   // These are the special masks that may be optimized.
04058   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
04059   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
04060   bool MatchEvenMask = true;
04061   bool MatchOddMask  = true;
04062   for (int i=0; i<8; ++i) {
04063     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
04064       MatchEvenMask = false;
04065     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
04066       MatchOddMask = false;
04067   }
04068 
04069   if (!MatchEvenMask && !MatchOddMask)
04070     return SDValue();
04071 
04072   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
04073 
04074   SDValue Op0 = SVOp->getOperand(0);
04075   SDValue Op1 = SVOp->getOperand(1);
04076 
04077   if (MatchEvenMask) {
04078     // Shift the second operand right by 32 bits.
04079     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
04080     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
04081   } else {
04082     // Shift the first operand left by 32 bits.
04083     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
04084     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
04085   }
04086   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
04087   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
04088 }
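
// [Editorial illustration, not part of the original source file.] For the
// "even" mask <0, 8, 2, 10, 4, 12, 6, 14>, the code above first shuffles Op1
// so that its even elements land in the odd result slots (an in-lane move),
// then blends element-by-element with Op0, replacing the two-source
// interleave with a single-source shuffle plus a blend.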
04089 
04090 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
04091 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
04092 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
04093                          bool HasInt256, bool V2IsSplat = false) {
04094 
04095   assert(VT.getSizeInBits() >= 128 &&
04096          "Unsupported vector type for unpckl");
04097 
04098   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04099   unsigned NumLanes;
04100   unsigned NumOf256BitLanes;
04101   unsigned NumElts = VT.getVectorNumElements();
04102   if (VT.is256BitVector()) {
04103     if (NumElts != 4 && NumElts != 8 &&
04104         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04105       return false;
04106     NumLanes = 2;
04107     NumOf256BitLanes = 1;
04108   } else if (VT.is512BitVector()) {
04109     assert(VT.getScalarType().getSizeInBits() >= 32 &&
04110            "Unsupported vector type for unpckl");
04111     NumLanes = 2;
04112     NumOf256BitLanes = 2;
04113   } else {
04114     NumLanes = 1;
04115     NumOf256BitLanes = 1;
04116   }
04117 
04118   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
04119   unsigned NumLaneElts = NumEltsInStride/NumLanes;
04120 
04121   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
04122     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
04123       for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04124         int BitI  = Mask[l256*NumEltsInStride+l+i];
04125         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
04126         if (!isUndefOrEqual(BitI, j+l256*NumElts))
04127           return false;
04128         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
04129           return false;
04130         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
04131           return false;
04132       }
04133     }
04134   }
04135   return true;
04136 }
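
// [Editorial illustration, not part of the original source file.] The
// canonical UNPCKL patterns accepted here interleave the low halves of the
// two sources, one 128-bit lane at a time:
//
//   v4i32 (SSE):  <0, 4, 1, 5>
//   v8i32 (AVX):  <0, 8, 1, 9, 4, 12, 5, 13>   (each lane unpacked independently)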
04137 
04138 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
04139 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
04140 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
04141                          bool HasInt256, bool V2IsSplat = false) {
04142   assert(VT.getSizeInBits() >= 128 &&
04143          "Unsupported vector type for unpckh");
04144 
04145   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04146   unsigned NumLanes;
04147   unsigned NumOf256BitLanes;
04148   unsigned NumElts = VT.getVectorNumElements();
04149   if (VT.is256BitVector()) {
04150     if (NumElts != 4 && NumElts != 8 &&
04151         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04152       return false;
04153     NumLanes = 2;
04154     NumOf256BitLanes = 1;
04155   } else if (VT.is512BitVector()) {
04156     assert(VT.getScalarType().getSizeInBits() >= 32 &&
04157            "Unsupported vector type for unpckh");
04158     NumLanes = 2;
04159     NumOf256BitLanes = 2;
04160   } else {
04161     NumLanes = 1;
04162     NumOf256BitLanes = 1;
04163   }
04164 
04165   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
04166   unsigned NumLaneElts = NumEltsInStride/NumLanes;
04167 
04168   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
04169     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
04170       for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04171         int BitI  = Mask[l256*NumEltsInStride+l+i];
04172         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
04173         if (!isUndefOrEqual(BitI, j+l256*NumElts))
04174           return false;
04175         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
04176           return false;
04177         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
04178           return false;
04179       }
04180     }
04181   }
04182   return true;
04183 }
04184 
04185 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
04186 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
04187 /// <0, 0, 1, 1>
04188 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04189   unsigned NumElts = VT.getVectorNumElements();
04190   bool Is256BitVec = VT.is256BitVector();
04191 
04192   if (VT.is512BitVector())
04193     return false;
04194   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04195          "Unsupported vector type for unpckl");
04196 
04197   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
04198       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04199     return false;
04200 
04201   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
04202   // FIXME: Need a better way to get rid of this, there's no latency difference
04203   // between UNPCKLPD and MOVDDUP; the latter should always be checked first and
04204   // the former later. We should also remove the "_undef" special mask.
04205   if (NumElts == 4 && Is256BitVec)
04206     return false;
04207 
04208   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04209   // independently on 128-bit lanes.
04210   unsigned NumLanes = VT.getSizeInBits()/128;
04211   unsigned NumLaneElts = NumElts/NumLanes;
04212 
04213   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04214     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04215       int BitI  = Mask[l+i];
04216       int BitI1 = Mask[l+i+1];
04217 
04218       if (!isUndefOrEqual(BitI, j))
04219         return false;
04220       if (!isUndefOrEqual(BitI1, j))
04221         return false;
04222     }
04223   }
04224 
04225   return true;
04226 }
04227 
04228 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
04229 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
04230 /// <2, 2, 3, 3>
04231 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04232   unsigned NumElts = VT.getVectorNumElements();
04233 
04234   if (VT.is512BitVector())
04235     return false;
04236 
04237   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04238          "Unsupported vector type for unpckh");
04239 
04240   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04241       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04242     return false;
04243 
04244   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04245   // independently on 128-bit lanes.
04246   unsigned NumLanes = VT.getSizeInBits()/128;
04247   unsigned NumLaneElts = NumElts/NumLanes;
04248 
04249   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04250     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04251       int BitI  = Mask[l+i];
04252       int BitI1 = Mask[l+i+1];
04253       if (!isUndefOrEqual(BitI, j))
04254         return false;
04255       if (!isUndefOrEqual(BitI1, j))
04256         return false;
04257     }
04258   }
04259   return true;
04260 }
04261 
04262 // Match masks for the INSERTI64x4/INSERTF64x4 instructions: (src0[0], src1[0]) or
04263 // (src1[0], src0[1]), i.e. manipulations of 256-bit sub-vectors.
04264 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
04265   if (!VT.is512BitVector())
04266     return false;
04267 
04268   unsigned NumElts = VT.getVectorNumElements();
04269   unsigned HalfSize = NumElts/2;
04270   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
04271     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
04272       *Imm = 1;
04273       return true;
04274     }
04275   }
04276   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
04277     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
04278       *Imm = 0;
04279       return true;
04280     }
04281   }
04282   return false;
04283 }
04284 
04285 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
04286 /// specifies a shuffle of elements that is suitable for input to MOVSS,
04287 /// MOVSD, and MOVD, i.e. setting the lowest element.
04288 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
04289   if (VT.getVectorElementType().getSizeInBits() < 32)
04290     return false;
04291   if (!VT.is128BitVector())
04292     return false;
04293 
04294   unsigned NumElts = VT.getVectorNumElements();
04295 
04296   if (!isUndefOrEqual(Mask[0], NumElts))
04297     return false;
04298 
04299   for (unsigned i = 1; i != NumElts; ++i)
04300     if (!isUndefOrEqual(Mask[i], i))
04301       return false;
04302 
04303   return true;
04304 }
04305 
04306 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
04307 /// as permutations between 128-bit chunks or halves. As an example: this
04308 /// shuffle below:
04309 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
04310 /// The first half comes from the second half of V1 and the second half from
04311 /// the second half of V2.
04312 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04313   if (!HasFp256 || !VT.is256BitVector())
04314     return false;
04315 
04316   // The shuffle result is divided into half A and half B. In total the two
04317   // sources have 4 halves, namely: C, D, E, F. The final values of A and
04318   // B must come from C, D, E or F.
04319   unsigned HalfSize = VT.getVectorNumElements()/2;
04320   bool MatchA = false, MatchB = false;
04321 
04322   // Check if A comes from one of C, D, E, F.
04323   for (unsigned Half = 0; Half != 4; ++Half) {
04324     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
04325       MatchA = true;
04326       break;
04327     }
04328   }
04329 
04330   // Check if B comes from one of C, D, E, F.
04331   for (unsigned Half = 0; Half != 4; ++Half) {
04332     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
04333       MatchB = true;
04334       break;
04335     }
04336   }
04337 
04338   return MatchA && MatchB;
04339 }
04340 
04341 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
04342 /// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
04343 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
04344   MVT VT = SVOp->getSimpleValueType(0);
04345 
04346   unsigned HalfSize = VT.getVectorNumElements()/2;
04347 
04348   unsigned FstHalf = 0, SndHalf = 0;
04349   for (unsigned i = 0; i < HalfSize; ++i) {
04350     if (SVOp->getMaskElt(i) > 0) {
04351       FstHalf = SVOp->getMaskElt(i)/HalfSize;
04352       break;
04353     }
04354   }
04355   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
04356     if (SVOp->getMaskElt(i) > 0) {
04357       SndHalf = SVOp->getMaskElt(i)/HalfSize;
04358       break;
04359     }
04360   }
04361 
04362   return (FstHalf | (SndHalf << 4));
04363 }
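
// [Editorial illustration, not part of the original source file.] For the
// v8i32 shuffle <4, 5, 6, 7, 12, 13, 14, 15> used as the example in the
// isVPERM2X128Mask comment above, the two result halves come from half 1
// (upper half of V1) and half 3 (upper half of V2), so the immediate computed
// here is 1 | (3 << 4) = 0x31.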
04364 
04365 // Symmetric in-lane mask. Each lane has 4 elements (for imm8).
04366 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
04367   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04368   if (EltSize < 32)
04369     return false;
04370 
04371   unsigned NumElts = VT.getVectorNumElements();
04372   Imm8 = 0;
04373   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
04374     for (unsigned i = 0; i != NumElts; ++i) {
04375       if (Mask[i] < 0)
04376         continue;
04377       Imm8 |= Mask[i] << (i*2);
04378     }
04379     return true;
04380   }
04381 
04382   unsigned LaneSize = 4;
04383   SmallVector<int, 4> MaskVal(LaneSize, -1);
04384 
04385   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04386     for (unsigned i = 0; i != LaneSize; ++i) {
04387       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04388         return false;
04389       if (Mask[i+l] < 0)
04390         continue;
04391       if (MaskVal[i] < 0) {
04392         MaskVal[i] = Mask[i+l] - l;
04393         Imm8 |= MaskVal[i] << (i*2);
04394         continue;
04395       }
04396       if (Mask[i+l] != (signed)(MaskVal[i]+l))
04397         return false;
04398     }
04399   }
04400   return true;
04401 }
04402 
04403 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
04404 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
04405 /// Note that VPERMIL mask matching differs depending on whether the underlying
04406 /// element type is 32 or 64 bits. In VPERMILPS the high half of the mask should
04407 /// point to the same elements as the low half, but in the upper half of the source.
04408 /// In VPERMILPD the two lanes could be shuffled independently of each other
04409 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
04410 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
04411   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04412   if (VT.getSizeInBits() < 256 || EltSize < 32)
04413     return false;
04414   bool symetricMaskRequired = (EltSize == 32);
04415   unsigned NumElts = VT.getVectorNumElements();
04416 
04417   unsigned NumLanes = VT.getSizeInBits()/128;
04418   unsigned LaneSize = NumElts/NumLanes;
04419   // 2 or 4 elements in one lane
04420 
04421   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
04422   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04423     for (unsigned i = 0; i != LaneSize; ++i) {
04424       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04425         return false;
04426       if (symetricMaskRequired) {
04427         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
04428           ExpectedMaskVal[i] = Mask[i+l] - l;
04429           continue;
04430         }
04431         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
04432           return false;
04433       }
04434     }
04435   }
04436   return true;
04437 }
04438 
04439 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
04440 /// x86 movss wants: x86 movss requires the lowest element to be the lowest
04441 /// element of vector 2 and the other elements to come from vector 1 in order.
04442 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
04443                                bool V2IsSplat = false, bool V2IsUndef = false) {
04444   if (!VT.is128BitVector())
04445     return false;
04446 
04447   unsigned NumOps = VT.getVectorNumElements();
04448   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
04449     return false;
04450 
04451   if (!isUndefOrEqual(Mask[0], 0))
04452     return false;
04453 
04454   for (unsigned i = 1; i != NumOps; ++i)
04455     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
04456           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
04457           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
04458       return false;
04459 
04460   return true;
04461 }
04462 
04463 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04464 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
04465 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
04466 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
04467                            const X86Subtarget *Subtarget) {
04468   if (!Subtarget->hasSSE3())
04469     return false;
04470 
04471   unsigned NumElems = VT.getVectorNumElements();
04472 
04473   if ((VT.is128BitVector() && NumElems != 4) ||
04474       (VT.is256BitVector() && NumElems != 8) ||
04475       (VT.is512BitVector() && NumElems != 16))
04476     return false;
04477 
04478   // "i+1" is the value the indexed mask element must have
04479   for (unsigned i = 0; i != NumElems; i += 2)
04480     if (!isUndefOrEqual(Mask[i], i+1) ||
04481         !isUndefOrEqual(Mask[i+1], i+1))
04482       return false;
04483 
04484   return true;
04485 }
04486 
04487 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04488 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
04489 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
04490 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
04491                            const X86Subtarget *Subtarget) {
04492   if (!Subtarget->hasSSE3())
04493     return false;
04494 
04495   unsigned NumElems = VT.getVectorNumElements();
04496 
04497   if ((VT.is128BitVector() && NumElems != 4) ||
04498       (VT.is256BitVector() && NumElems != 8) ||
04499       (VT.is512BitVector() && NumElems != 16))
04500     return false;
04501 
04502   // "i" is the value the indexed mask element must have
04503   for (unsigned i = 0; i != NumElems; i += 2)
04504     if (!isUndefOrEqual(Mask[i], i) ||
04505         !isUndefOrEqual(Mask[i+1], i))
04506       return false;
04507 
04508   return true;
04509 }
04510 
04511 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
04512 /// specifies a shuffle of elements that is suitable for input to 256-bit
04513 /// version of MOVDDUP.
04514 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04515   if (!HasFp256 || !VT.is256BitVector())
04516     return false;
04517 
04518   unsigned NumElts = VT.getVectorNumElements();
04519   if (NumElts != 4)
04520     return false;
04521 
04522   for (unsigned i = 0; i != NumElts/2; ++i)
04523     if (!isUndefOrEqual(Mask[i], 0))
04524       return false;
04525   for (unsigned i = NumElts/2; i != NumElts; ++i)
04526     if (!isUndefOrEqual(Mask[i], NumElts/2))
04527       return false;
04528   return true;
04529 }
04530 
04531 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04532 /// specifies a shuffle of elements that is suitable for input to 128-bit
04533 /// version of MOVDDUP.
04534 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
04535   if (!VT.is128BitVector())
04536     return false;
04537 
04538   unsigned e = VT.getVectorNumElements() / 2;
04539   for (unsigned i = 0; i != e; ++i)
04540     if (!isUndefOrEqual(Mask[i], i))
04541       return false;
04542   for (unsigned i = 0; i != e; ++i)
04543     if (!isUndefOrEqual(Mask[e+i], i))
04544       return false;
04545   return true;
04546 }
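
// [Editorial illustration, not part of the original source file.] MOVDDUP
// duplicates the low double of its source: the 128-bit form matches <0, 0> on
// v2f64, while the 256-bit form (isMOVDDUPYMask above) matches <0, 0, 2, 2>,
// duplicating the low element of each 128-bit lane.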
04547 
04548 /// isVEXTRACTIndex - Return true if the specified
04549 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
04550 /// suitable for instructions that extract 128- or 256-bit vectors.
04551 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
04552   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04553   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04554     return false;
04555 
04556   // The index should be aligned on a vecWidth-bit boundary.
04557   uint64_t Index =
04558     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04559 
04560   MVT VT = N->getSimpleValueType(0);
04561   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04562   bool Result = (Index * ElSize) % vecWidth == 0;
04563 
04564   return Result;
04565 }
04566 
04567 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
04568 /// operand specifies a subvector insert that is suitable for the
04569 /// insertion of 128- or 256-bit subvectors.
04570 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
04571   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04572   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04573     return false;
04574   // The index should be aligned on a vecWidth-bit boundary.
04575   uint64_t Index =
04576     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04577 
04578   MVT VT = N->getSimpleValueType(0);
04579   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04580   bool Result = (Index * ElSize) % vecWidth == 0;
04581 
04582   return Result;
04583 }
04584 
04585 bool X86::isVINSERT128Index(SDNode *N) {
04586   return isVINSERTIndex(N, 128);
04587 }
04588 
04589 bool X86::isVINSERT256Index(SDNode *N) {
04590   return isVINSERTIndex(N, 256);
04591 }
04592 
04593 bool X86::isVEXTRACT128Index(SDNode *N) {
04594   return isVEXTRACTIndex(N, 128);
04595 }
04596 
04597 bool X86::isVEXTRACT256Index(SDNode *N) {
04598   return isVEXTRACTIndex(N, 256);
04599 }
04600 
04601 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
04602 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
04603 /// Handles 128-bit and 256-bit.
04604 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
04605   MVT VT = N->getSimpleValueType(0);
04606 
04607   assert((VT.getSizeInBits() >= 128) &&
04608          "Unsupported vector type for PSHUF/SHUFP");
04609 
04610   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
04611   // independently on 128-bit lanes.
04612   unsigned NumElts = VT.getVectorNumElements();
04613   unsigned NumLanes = VT.getSizeInBits()/128;
04614   unsigned NumLaneElts = NumElts/NumLanes;
04615 
04616   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
04617          "Only supports 2, 4 or 8 elements per lane");
04618 
04619   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
04620   unsigned Mask = 0;
04621   for (unsigned i = 0; i != NumElts; ++i) {
04622     int Elt = N->getMaskElt(i);
04623     if (Elt < 0) continue;
04624     Elt &= NumLaneElts - 1;
04625     unsigned ShAmt = (i << Shift) % 8;
04626     Mask |= Elt << ShAmt;
04627   }
04628 
04629   return Mask;
04630 }
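
// [Editorial illustration, not part of the original source file.] For a
// hypothetical v4i32 shuffle with mask <3, 1, 2, 0>, each element contributes
// two bits, lowest element first:
//
//   imm = 3 | (1 << 2) | (2 << 4) | (0 << 6) = 0b00100111 = 0x27
//
// which matches the PSHUFD/SHUFPS immediate encoding.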
04631 
04632 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
04633 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
04634 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
04635   MVT VT = N->getSimpleValueType(0);
04636 
04637   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04638          "Unsupported vector type for PSHUFHW");
04639 
04640   unsigned NumElts = VT.getVectorNumElements();
04641 
04642   unsigned Mask = 0;
04643   for (unsigned l = 0; l != NumElts; l += 8) {
04644     // 8 nodes per lane, but we only care about the last 4.
04645     for (unsigned i = 0; i < 4; ++i) {
04646       int Elt = N->getMaskElt(l+i+4);
04647       if (Elt < 0) continue;
04648       Elt &= 0x3; // only 2-bits.
04649       Mask |= Elt << (i * 2);
04650     }
04651   }
04652 
04653   return Mask;
04654 }
04655 
04656 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
04657 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
04658 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
04659   MVT VT = N->getSimpleValueType(0);
04660 
04661   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04662          "Unsupported vector type for PSHUFLW");
04663 
04664   unsigned NumElts = VT.getVectorNumElements();
04665 
04666   unsigned Mask = 0;
04667   for (unsigned l = 0; l != NumElts; l += 8) {
04668     // 8 nodes per lane, but we only care about the first 4.
04669     for (unsigned i = 0; i < 4; ++i) {
04670       int Elt = N->getMaskElt(l+i);
04671       if (Elt < 0) continue;
04672       Elt &= 0x3; // only 2-bits
04673       Mask |= Elt << (i * 2);
04674     }
04675   }
04676 
04677   return Mask;
04678 }
04679 
04680 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
04681 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
04682 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
04683   MVT VT = SVOp->getSimpleValueType(0);
04684   unsigned EltSize = VT.is512BitVector() ? 1 :
04685     VT.getVectorElementType().getSizeInBits() >> 3;
04686 
04687   unsigned NumElts = VT.getVectorNumElements();
04688   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
04689   unsigned NumLaneElts = NumElts/NumLanes;
04690 
04691   int Val = 0;
04692   unsigned i;
04693   for (i = 0; i != NumElts; ++i) {
04694     Val = SVOp->getMaskElt(i);
04695     if (Val >= 0)
04696       break;
04697   }
04698   if (Val >= (int)NumElts)
04699     Val -= NumElts - NumLaneElts;
04700 
04701   assert(Val - i > 0 && "PALIGNR imm should be positive");
04702   return (Val - i) * EltSize;
04703 }
04704 
04705 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
04706   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04707   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04708     llvm_unreachable("Illegal extract subvector for VEXTRACT");
04709 
04710   uint64_t Index =
04711     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04712 
04713   MVT VecVT = N->getOperand(0).getSimpleValueType();
04714   MVT ElVT = VecVT.getVectorElementType();
04715 
04716   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04717   return Index / NumElemsPerChunk;
04718 }
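
// [Editorial illustration, not part of the original source file.] Extracting
// elements <4..7> of a v8i32 as a v4i32 has Index = 4; with vecWidth = 128 and
// 32-bit elements, NumElemsPerChunk = 4, so the returned immediate is
// 4 / 4 = 1, i.e. the "$1" operand of vextractf128.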
04719 
04720 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
04721   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04722   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04723     llvm_unreachable("Illegal insert subvector for VINSERT");
04724 
04725   uint64_t Index =
04726     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04727 
04728   MVT VecVT = N->getSimpleValueType(0);
04729   MVT ElVT = VecVT.getVectorElementType();
04730 
04731   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04732   return Index / NumElemsPerChunk;
04733 }
04734 
04735 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
04736 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
04737 /// and VEXTRACTI128 instructions.
04738 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
04739   return getExtractVEXTRACTImmediate(N, 128);
04740 }
04741 
04742 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
04743 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
04744 /// and VEXTRACTI64x4 instructions.
04745 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
04746   return getExtractVEXTRACTImmediate(N, 256);
04747 }
04748 
04749 /// getInsertVINSERT128Immediate - Return the appropriate immediate
04750 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
04751 /// and VINSERTI128 instructions.
04752 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
04753   return getInsertVINSERTImmediate(N, 128);
04754 }
04755 
04756 /// getInsertVINSERT256Immediate - Return the appropriate immediate
04757 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
04758 /// and VINSERTI64x4 instructions.
04759 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
04760   return getInsertVINSERTImmediate(N, 256);
04761 }
04762 
04763 /// isZero - Returns true if V is a constant integer zero.
04764 static bool isZero(SDValue V) {
04765   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
04766   return C && C->isNullValue();
04767 }
04768 
04769 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
04770 /// constant +0.0.
04771 bool X86::isZeroNode(SDValue Elt) {
04772   if (isZero(Elt))
04773     return true;
04774   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
04775     return CFP->getValueAPF().isPosZero();
04776   return false;
04777 }
04778 
04779 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
04780 /// match movhlps. The lower half elements should come from upper half of
04781 /// V1 (and in order), and the upper half elements should come from the upper
04782 /// half of V2 (and in order).
04783 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
04784   if (!VT.is128BitVector())
04785     return false;
04786   if (VT.getVectorNumElements() != 4)
04787     return false;
04788   for (unsigned i = 0, e = 2; i != e; ++i)
04789     if (!isUndefOrEqual(Mask[i], i+2))
04790       return false;
04791   for (unsigned i = 2; i != 4; ++i)
04792     if (!isUndefOrEqual(Mask[i], i+4))
04793       return false;
04794   return true;
04795 }
04796 
04797 /// isScalarLoadToVector - Returns true if the node is a scalar load that
04798 /// is promoted to a vector. It also returns the LoadSDNode by reference if
04799 /// required.
04800 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
04801   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
04802     return false;
04803   N = N->getOperand(0).getNode();
04804   if (!ISD::isNON_EXTLoad(N))
04805     return false;
04806   if (LD)
04807     *LD = cast<LoadSDNode>(N);
04808   return true;
04809 }
04810 
04811 // Test whether the given value is a vector value which will be legalized
04812 // into a load.
04813 static bool WillBeConstantPoolLoad(SDNode *N) {
04814   if (N->getOpcode() != ISD::BUILD_VECTOR)
04815     return false;
04816 
04817   // Check for any non-constant elements.
04818   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
04819     switch (N->getOperand(i).getNode()->getOpcode()) {
04820     case ISD::UNDEF:
04821     case ISD::ConstantFP:
04822     case ISD::Constant:
04823       break;
04824     default:
04825       return false;
04826     }
04827 
04828   // Vectors of all-zeros and all-ones are materialized with special
04829   // instructions rather than being loaded.
04830   return !ISD::isBuildVectorAllZeros(N) &&
04831          !ISD::isBuildVectorAllOnes(N);
04832 }
04833 
04834 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
04835 /// match movlp{s|d}. The lower half elements should come from lower half of
04836 /// V1 (and in order), and the upper half elements should come from the upper
04837 /// half of V2 (and in order). And since V1 will become the source of the
04838 /// MOVLP, it must be either a vector load or a scalar load to vector.
04839 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
04840                                ArrayRef<int> Mask, MVT VT) {
04841   if (!VT.is128BitVector())
04842     return false;
04843 
04844   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
04845     return false;
04846   // If V2 is a vector load, don't do this transformation. We will try to use a
04847   // load-folding shufps op instead.
04848   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
04849     return false;
04850 
04851   unsigned NumElems = VT.getVectorNumElements();
04852 
04853   if (NumElems != 2 && NumElems != 4)
04854     return false;
04855   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04856     if (!isUndefOrEqual(Mask[i], i))
04857       return false;
04858   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
04859     if (!isUndefOrEqual(Mask[i], i+NumElems))
04860       return false;
04861   return true;
04862 }
04863 
04864 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
04865 /// to a zero vector.
04866 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
04867 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
04868   SDValue V1 = N->getOperand(0);
04869   SDValue V2 = N->getOperand(1);
04870   unsigned NumElems = N->getValueType(0).getVectorNumElements();
04871   for (unsigned i = 0; i != NumElems; ++i) {
04872     int Idx = N->getMaskElt(i);
04873     if (Idx >= (int)NumElems) {
04874       unsigned Opc = V2.getOpcode();
04875       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
04876         continue;
04877       if (Opc != ISD::BUILD_VECTOR ||
04878           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
04879         return false;
04880     } else if (Idx >= 0) {
04881       unsigned Opc = V1.getOpcode();
04882       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
04883         continue;
04884       if (Opc != ISD::BUILD_VECTOR ||
04885           !X86::isZeroNode(V1.getOperand(Idx)))
04886         return false;
04887     }
04888   }
04889   return true;
04890 }
04891 
04892 /// getZeroVector - Returns a vector of specified type with all zero elements.
04893 ///
04894 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
04895                              SelectionDAG &DAG, SDLoc dl) {
04896   assert(VT.isVector() && "Expected a vector type");
04897 
04898   // Always build SSE zero vectors as <4 x i32> bitcasted
04899   // to their dest type. This ensures they get CSE'd.
04900   SDValue Vec;
04901   if (VT.is128BitVector()) {  // SSE
04902     if (Subtarget->hasSSE2()) {  // SSE2
04903       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04904       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04905     } else { // SSE1
04906       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
04907       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
04908     }
04909   } else if (VT.is256BitVector()) { // AVX
04910     if (Subtarget->hasInt256()) { // AVX2
04911       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04912       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04913       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
04914     } else {
04915       // 256-bit logic and arithmetic instructions in AVX are all
04916       // floating-point, no support for integer ops. Emit fp zeroed vectors.
04917       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
04918       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04919       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
04920     }
04921   } else if (VT.is512BitVector()) { // AVX-512
04922       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04923       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
04924                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04925       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
04926   } else if (VT.getScalarType() == MVT::i1) {
04927     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
04928     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
04929     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
04930     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
04931   } else
04932     llvm_unreachable("Unexpected vector type");
04933 
04934   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04935 }
04936 
04937 /// getOnesVector - Returns a vector of specified type with all bits set.
04938 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
04939 /// no AVX2 support, use two <4 x i32> inserted in an <8 x i32> appropriately.
04940 /// Then bitcast to their original type, ensuring they get CSE'd.
04941 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
04942                              SDLoc dl) {
04943   assert(VT.isVector() && "Expected a vector type");
04944 
04945   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
04946   SDValue Vec;
04947   if (VT.is256BitVector()) {
04948     if (HasInt256) { // AVX2
04949       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04950       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
04951     } else { // AVX
04952       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04953       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
04954     }
04955   } else if (VT.is128BitVector()) {
04956     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04957   } else
04958     llvm_unreachable("Unexpected vector type");
04959 
04960   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04961 }
04962 
04963 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
04964 /// that point to V2 point to its first element.
04965 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
04966   for (unsigned i = 0; i != NumElems; ++i) {
04967     if (Mask[i] > (int)NumElems) {
04968       Mask[i] = NumElems;
04969     }
04970   }
04971 }
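
// A minimal standalone sketch (not part of this file, helper name is
// hypothetical) of the normalization above: any mask element that points
// past the first element of the splatted V2 is redirected to NumElems,
// i.e. to V2's element 0.
#include <vector>

static std::vector<int> normalizeSplatMask(std::vector<int> Mask,
                                           unsigned NumElems) {
  for (int &M : Mask)
    if (M > (int)NumElems)
      M = NumElems;                  // every V2 reference collapses to V2[0]
  return Mask;
}

// Example: normalizeSplatMask({0, 6, 5, 3}, 4) yields {0, 4, 4, 3}.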
04972 
04973 /// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
04974 /// operation of the specified width.
04975 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
04976                        SDValue V2) {
04977   unsigned NumElems = VT.getVectorNumElements();
04978   SmallVector<int, 8> Mask;
04979   Mask.push_back(NumElems);
04980   for (unsigned i = 1; i != NumElems; ++i)
04981     Mask.push_back(i);
04982   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04983 }
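
// A small illustration (assumed standalone helper, not part of this file) of
// the mask getMOVL builds: lane 0 is taken from V2 and the remaining lanes
// keep their positions in V1, which is the movss/movsd selection pattern.
#include <vector>

static std::vector<int> movlMask(unsigned NumElems) {
  std::vector<int> Mask;
  Mask.push_back(NumElems);          // lane 0 comes from V2[0]
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);               // remaining lanes come from V1
  return Mask;
}

// movlMask(4) == {4, 1, 2, 3} and movlMask(2) == {2, 1}.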
04984 
04985 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
04986 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04987                           SDValue V2) {
04988   unsigned NumElems = VT.getVectorNumElements();
04989   SmallVector<int, 8> Mask;
04990   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
04991     Mask.push_back(i);
04992     Mask.push_back(i + NumElems);
04993   }
04994   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04995 }
04996 
04997 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
04998 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04999                           SDValue V2) {
05000   unsigned NumElems = VT.getVectorNumElements();
05001   SmallVector<int, 8> Mask;
05002   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
05003     Mask.push_back(i + Half);
05004     Mask.push_back(i + NumElems + Half);
05005   }
05006   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05007 }
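
// Illustrative sketch (plain C++, hypothetical helper) of the interleaving
// masks produced by getUnpackl and getUnpackh. For NumElems == 4:
//   unpackl -> {0, 4, 1, 5}   (low halves of V1 and V2 interleaved)
//   unpackh -> {2, 6, 3, 7}   (high halves of V1 and V2 interleaved)
#include <vector>

static std::vector<int> unpackMask(unsigned NumElems, bool High) {
  std::vector<int> Mask;
  unsigned Base = High ? NumElems / 2 : 0;
  for (unsigned i = 0; i != NumElems / 2; ++i) {
    Mask.push_back(Base + i);            // element from V1
    Mask.push_back(Base + i + NumElems); // matching element from V2
  }
  return Mask;
}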
05008 
05009 // PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
05010 // a generic shuffle instruction because the target has no such instructions.
05011 // Generate shuffles which repeat i16 and i8 several times until they can be
05012 // represented by v4f32 and then be manipulated by target supported shuffles.
05013 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
05014   MVT VT = V.getSimpleValueType();
05015   int NumElems = VT.getVectorNumElements();
05016   SDLoc dl(V);
05017 
05018   while (NumElems > 4) {
05019     if (EltNo < NumElems/2) {
05020       V = getUnpackl(DAG, dl, VT, V, V);
05021     } else {
05022       V = getUnpackh(DAG, dl, VT, V, V);
05023       EltNo -= NumElems/2;
05024     }
05025     NumElems >>= 1;
05026   }
05027   return V;
05028 }
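
// A standalone trace (assumed helper, no DAG involved) of how the loop above
// narrows a wide integer splat: each self-unpack halves the element count
// that still matters, and EltNo is remapped whenever the splatted element
// lives in the upper half.
#include <cstdio>

static void traceSplatPromotion(int NumElems, int EltNo) {
  while (NumElems > 4) {
    if (EltNo < NumElems / 2) {
      std::printf("unpackl, EltNo stays %d\n", EltNo);
    } else {
      EltNo -= NumElems / 2;
      std::printf("unpackh, EltNo becomes %d\n", EltNo);
    }
    NumElems >>= 1;
  }
}

// traceSplatPromotion(16, 9) prints:
//   unpackh, EltNo becomes 1    (16 -> 8 elements)
//   unpackl, EltNo stays 1      (8 -> 4 elements)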
05029 
05030 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
05031 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
05032   MVT VT = V.getSimpleValueType();
05033   SDLoc dl(V);
05034 
05035   if (VT.is128BitVector()) {
05036     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
05037     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
05038     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
05039                              &SplatMask[0]);
05040   } else if (VT.is256BitVector()) {
05041     // To use VPERMILPS to splat scalars, the second half of indices must
05042     // refer to the higher part, which is a duplication of the lower one,
05043     // because VPERMILPS can only handle in-lane permutations.
05044     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
05045                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
05046 
05047     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
05048     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
05049                              &SplatMask[0]);
05050   } else
05051     llvm_unreachable("Vector size not supported");
05052 
05053   return DAG.getNode(ISD::BITCAST, dl, VT, V);
05054 }
05055 
05056 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
05057 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
05058   MVT SrcVT = SV->getSimpleValueType(0);
05059   SDValue V1 = SV->getOperand(0);
05060   SDLoc dl(SV);
05061 
05062   int EltNo = SV->getSplatIndex();
05063   int NumElems = SrcVT.getVectorNumElements();
05064   bool Is256BitVec = SrcVT.is256BitVector();
05065 
05066   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
05067          "Unknown how to promote splat for type");
05068 
05069   // Extract the 128-bit part containing the splat element and update
05070   // the splat element index when it refers to the higher register.
05071   if (Is256BitVec) {
05072     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
05073     if (EltNo >= NumElems/2)
05074       EltNo -= NumElems/2;
05075   }
05076 
05077   // All i16 and i8 vector types can't be used directly by a generic shuffle
05078   // instruction because the target has no such instruction. Generate shuffles
05079   // which repeat i16 and i8 several times until they fit in i32, and then can
05080   // be manipulated by target supported shuffles.
05081   MVT EltVT = SrcVT.getVectorElementType();
05082   if (EltVT == MVT::i8 || EltVT == MVT::i16)
05083     V1 = PromoteSplati8i16(V1, DAG, EltNo);
05084 
05085   // Recreate the 256-bit vector and place the same 128-bit vector
05086   // into the low and high part. This is necessary because we want
05087   // to use VPERM* to shuffle the vectors
05088   if (Is256BitVec) {
05089     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
05090   }
05091 
05092   return getLegalSplat(DAG, V1, EltNo);
05093 }
05094 
05095 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
05096 /// vector and a zero or undef vector.  This produces a shuffle where the low
05097 /// element of V2 is swizzled into the zero/undef vector, landing at element
05098 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
05099 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
05100                                            bool IsZero,
05101                                            const X86Subtarget *Subtarget,
05102                                            SelectionDAG &DAG) {
05103   MVT VT = V2.getSimpleValueType();
05104   SDValue V1 = IsZero
05105     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
05106   unsigned NumElems = VT.getVectorNumElements();
05107   SmallVector<int, 16> MaskVec;
05108   for (unsigned i = 0; i != NumElems; ++i)
05109     // If this is the insertion idx, put the low elt of V2 here.
05110     MaskVec.push_back(i == Idx ? NumElems : i);
05111   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
05112 }
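
// Minimal sketch (hypothetical helper) of the mask built above: identity for
// every lane except Idx, which pulls in element 0 of V2 (index NumElems).
#include <vector>

static std::vector<int> zeroOrUndefMask(unsigned NumElems, unsigned Idx) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumElems; ++i)
    Mask.push_back(i == Idx ? NumElems : i);
  return Mask;
}

// zeroOrUndefMask(4, 0) == {4, 1, 2, 3}; zeroOrUndefMask(4, 3) == {0, 1, 2, 4}.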
05113 
05114 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
05115 /// target specific opcode. Returns true if the Mask could be calculated.
05116 /// Sets IsUnary to true if it only uses one source.
05117 static bool getTargetShuffleMask(SDNode *N, MVT VT,
05118                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
05119   unsigned NumElems = VT.getVectorNumElements();
05120   SDValue ImmN;
05121 
05122   IsUnary = false;
05123   switch(N->getOpcode()) {
05124   case X86ISD::SHUFP:
05125     ImmN = N->getOperand(N->getNumOperands()-1);
05126     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05127     break;
05128   case X86ISD::UNPCKH:
05129     DecodeUNPCKHMask(VT, Mask);
05130     break;
05131   case X86ISD::UNPCKL:
05132     DecodeUNPCKLMask(VT, Mask);
05133     break;
05134   case X86ISD::MOVHLPS:
05135     DecodeMOVHLPSMask(NumElems, Mask);
05136     break;
05137   case X86ISD::MOVLHPS:
05138     DecodeMOVLHPSMask(NumElems, Mask);
05139     break;
05140   case X86ISD::PALIGNR:
05141     ImmN = N->getOperand(N->getNumOperands()-1);
05142     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05143     break;
05144   case X86ISD::PSHUFD:
05145   case X86ISD::VPERMILP:
05146     ImmN = N->getOperand(N->getNumOperands()-1);
05147     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05148     IsUnary = true;
05149     break;
05150   case X86ISD::PSHUFHW:
05151     ImmN = N->getOperand(N->getNumOperands()-1);
05152     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05153     IsUnary = true;
05154     break;
05155   case X86ISD::PSHUFLW:
05156     ImmN = N->getOperand(N->getNumOperands()-1);
05157     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05158     IsUnary = true;
05159     break;
05160   case X86ISD::VPERMI:
05161     ImmN = N->getOperand(N->getNumOperands()-1);
05162     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05163     IsUnary = true;
05164     break;
05165   case X86ISD::MOVSS:
05166   case X86ISD::MOVSD: {
05167     // Index 0 always comes from the first element of the second source;
05168     // this is why MOVSS and MOVSD are used in the first place. The other
05169     // elements come from the remaining positions of the first source vector.
05170     Mask.push_back(NumElems);
05171     for (unsigned i = 1; i != NumElems; ++i) {
05172       Mask.push_back(i);
05173     }
05174     break;
05175   }
05176   case X86ISD::VPERM2X128:
05177     ImmN = N->getOperand(N->getNumOperands()-1);
05178     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05179     if (Mask.empty()) return false;
05180     break;
05181   case X86ISD::MOVDDUP:
05182   case X86ISD::MOVLHPD:
05183   case X86ISD::MOVLPD:
05184   case X86ISD::MOVLPS:
05185   case X86ISD::MOVSHDUP:
05186   case X86ISD::MOVSLDUP:
05187     // Not yet implemented
05188     return false;
05189   default: llvm_unreachable("unknown target shuffle node");
05190   }
05191 
05192   return true;
05193 }
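
// As one concrete instance of the immediate decoding dispatched above, a
// standalone sketch (4 x 32-bit case only, helper name is an assumption) of
// how a SHUFP immediate maps to a shuffle mask: the two low 2-bit fields
// select lanes of the first source, the two high fields select lanes of the
// second source (offset by 4 in shuffle-mask numbering).
#include <vector>

static std::vector<int> decodeShufps4x32(unsigned Imm) {
  return { int(Imm & 3), int((Imm >> 2) & 3),
           int(4 + ((Imm >> 4) & 3)), int(4 + ((Imm >> 6) & 3)) };
}

// decodeShufps4x32(0x1B) == {3, 2, 5, 4}.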
05194 
05195 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
05196 /// element of the result of the vector shuffle.
05197 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
05198                                    unsigned Depth) {
05199   if (Depth == 6)
05200     return SDValue();  // Limit search depth.
05201 
05202   SDValue V = SDValue(N, 0);
05203   EVT VT = V.getValueType();
05204   unsigned Opcode = V.getOpcode();
05205 
05206   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
05207   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
05208     int Elt = SV->getMaskElt(Index);
05209 
05210     if (Elt < 0)
05211       return DAG.getUNDEF(VT.getVectorElementType());
05212 
05213     unsigned NumElems = VT.getVectorNumElements();
05214     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
05215                                          : SV->getOperand(1);
05216     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
05217   }
05218 
05219   // Recurse into target specific vector shuffles to find scalars.
05220   if (isTargetShuffle(Opcode)) {
05221     MVT ShufVT = V.getSimpleValueType();
05222     unsigned NumElems = ShufVT.getVectorNumElements();
05223     SmallVector<int, 16> ShuffleMask;
05224     bool IsUnary;
05225 
05226     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
05227       return SDValue();
05228 
05229     int Elt = ShuffleMask[Index];
05230     if (Elt < 0)
05231       return DAG.getUNDEF(ShufVT.getVectorElementType());
05232 
05233     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
05234                                          : N->getOperand(1);
05235     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
05236                                Depth+1);
05237   }
05238 
05239   // Actual nodes that may contain scalar elements
05240   if (Opcode == ISD::BITCAST) {
05241     V = V.getOperand(0);
05242     EVT SrcVT = V.getValueType();
05243     unsigned NumElems = VT.getVectorNumElements();
05244 
05245     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
05246       return SDValue();
05247   }
05248 
05249   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
05250     return (Index == 0) ? V.getOperand(0)
05251                         : DAG.getUNDEF(VT.getVectorElementType());
05252 
05253   if (V.getOpcode() == ISD::BUILD_VECTOR)
05254     return V.getOperand(Index);
05255 
05256   return SDValue();
05257 }
05258 
05259 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
05260 /// shuffle operation which come consecutively from a zero vector. The
05261 /// search can start in two different directions, from left or right.
05262 /// We count undefs as zeros until PreferredNum is reached.
05263 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
05264                                          unsigned NumElems, bool ZerosFromLeft,
05265                                          SelectionDAG &DAG,
05266                                          unsigned PreferredNum = -1U) {
05267   unsigned NumZeros = 0;
05268   for (unsigned i = 0; i != NumElems; ++i) {
05269     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
05270     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
05271     if (!Elt.getNode())
05272       break;
05273 
05274     if (X86::isZeroNode(Elt))
05275       ++NumZeros;
05276     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
05277       NumZeros = std::min(NumZeros + 1, PreferredNum);
05278     else
05279       break;
05280   }
05281 
05282   return NumZeros;
05283 }
05284 
05285 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
05286 /// correspond consecutively to elements from one of the vector operands,
05287 /// starting from its index OpIdx. Also report in OpNum which source operand was used.
05288 static
05289 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
05290                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
05291                               unsigned NumElems, unsigned &OpNum) {
05292   bool SeenV1 = false;
05293   bool SeenV2 = false;
05294 
05295   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
05296     int Idx = SVOp->getMaskElt(i);
05297     // Ignore undef indices
05298     if (Idx < 0)
05299       continue;
05300 
05301     if (Idx < (int)NumElems)
05302       SeenV1 = true;
05303     else
05304       SeenV2 = true;
05305 
05306     // Only accept consecutive elements from the same vector
05307     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
05308       return false;
05309   }
05310 
05311   OpNum = SeenV1 ? 0 : 1;
05312   return true;
05313 }
05314 
05315 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
05316 /// logical right shift of a vector.
05317 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05318                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05319   unsigned NumElems =
05320     SVOp->getSimpleValueType(0).getVectorNumElements();
05321   unsigned NumZeros = getNumOfConsecutiveZeros(
05322       SVOp, NumElems, false /* check zeros from right */, DAG,
05323       SVOp->getMaskElt(0));
05324   unsigned OpSrc;
05325 
05326   if (!NumZeros)
05327     return false;
05328 
05329   // Considering the elements in the mask that are not consecutive zeros,
05330   // check if they consecutively come from only one of the source vectors.
05331   //
05332   //               V1 = {X, A, B, C}     0
05333   //                         \  \  \    /
05334   //   vector_shuffle V1, V2 <1, 2, 3, X>
05335   //
05336   if (!isShuffleMaskConsecutive(SVOp,
05337             0,                   // Mask Start Index
05338             NumElems-NumZeros,   // Mask End Index(exclusive)
05339             NumZeros,            // Where to start looking in the src vector
05340             NumElems,            // Number of elements in vector
05341             OpSrc))              // Which source operand ?
05342     return false;
05343 
05344   isLeft = false;
05345   ShAmt = NumZeros;
05346   ShVal = SVOp->getOperand(OpSrc);
05347   return true;
05348 }
05349 
05350 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
05351 /// logical left shift of a vector.
05352 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05353                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05354   unsigned NumElems =
05355     SVOp->getSimpleValueType(0).getVectorNumElements();
05356   unsigned NumZeros = getNumOfConsecutiveZeros(
05357       SVOp, NumElems, true /* check zeros from left */, DAG,
05358       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
05359   unsigned OpSrc;
05360 
05361   if (!NumZeros)
05362     return false;
05363 
05364   // Considering the elements in the mask that are not consecutive zeros,
05365   // check if they consecutively come from only one of the source vectors.
05366   //
05367   //                           0    { A, B, X, X } = V2
05368   //                          / \    /  /
05369   //   vector_shuffle V1, V2 <X, X, 4, 5>
05370   //
05371   if (!isShuffleMaskConsecutive(SVOp,
05372             NumZeros,     // Mask Start Index
05373             NumElems,     // Mask End Index(exclusive)
05374             0,            // Where to start looking in the src vector
05375             NumElems,     // Number of elements in vector
05376             OpSrc))       // Which source operand ?
05377     return false;
05378 
05379   isLeft = true;
05380   ShAmt = NumZeros;
05381   ShVal = SVOp->getOperand(OpSrc);
05382   return true;
05383 }
05384 
05385 /// isVectorShift - Returns true if the shuffle can be implemented as a
05386 /// logical left or right shift of a vector.
05387 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05388                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05389   // Although the logic below supports any bit width, there are no
05390   // shift instructions which handle more than 128-bit vectors.
05391   if (!SVOp->getSimpleValueType(0).is128BitVector())
05392     return false;
05393 
05394   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
05395       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
05396     return true;
05397 
05398   return false;
05399 }
05400 
05401 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
05402 ///
05403 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
05404                                        unsigned NumNonZero, unsigned NumZero,
05405                                        SelectionDAG &DAG,
05406                                        const X86Subtarget* Subtarget,
05407                                        const TargetLowering &TLI) {
05408   if (NumNonZero > 8)
05409     return SDValue();
05410 
05411   SDLoc dl(Op);
05412   SDValue V;
05413   bool First = true;
05414   for (unsigned i = 0; i < 16; ++i) {
05415     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
05416     if (ThisIsNonZero && First) {
05417       if (NumZero)
05418         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05419       else
05420         V = DAG.getUNDEF(MVT::v8i16);
05421       First = false;
05422     }
05423 
05424     if ((i & 1) != 0) {
05425       SDValue ThisElt, LastElt;
05426       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
05427       if (LastIsNonZero) {
05428         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
05429                               MVT::i16, Op.getOperand(i-1));
05430       }
05431       if (ThisIsNonZero) {
05432         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
05433         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
05434                               ThisElt, DAG.getConstant(8, MVT::i8));
05435         if (LastIsNonZero)
05436           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
05437       } else
05438         ThisElt = LastElt;
05439 
05440       if (ThisElt.getNode())
05441         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
05442                         DAG.getIntPtrConstant(i/2));
05443     }
05444   }
05445 
05446   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
05447 }
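
// Standalone sketch of the byte-pairing trick above: two adjacent i8 lanes
// are zero-extended and merged into one i16 lane as (hi << 8) | lo, so a
// v16i8 build_vector can be assembled with half as many v8i16 inserts.
#include <cstdint>

static uint16_t packBytePair(uint8_t Lo, uint8_t Hi) {
  return (uint16_t)(((uint16_t)Hi << 8) | Lo);
}

// packBytePair(0x34, 0x12) == 0x1234: the even-indexed byte occupies the low
// half and the odd-indexed byte the high half of the resulting i16 lane.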
05448 
05449 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
05450 ///
05451 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
05452                                      unsigned NumNonZero, unsigned NumZero,
05453                                      SelectionDAG &DAG,
05454                                      const X86Subtarget* Subtarget,
05455                                      const TargetLowering &TLI) {
05456   if (NumNonZero > 4)
05457     return SDValue();
05458 
05459   SDLoc dl(Op);
05460   SDValue V;
05461   bool First = true;
05462   for (unsigned i = 0; i < 8; ++i) {
05463     bool isNonZero = (NonZeros & (1 << i)) != 0;
05464     if (isNonZero) {
05465       if (First) {
05466         if (NumZero)
05467           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05468         else
05469           V = DAG.getUNDEF(MVT::v8i16);
05470         First = false;
05471       }
05472       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
05473                       MVT::v8i16, V, Op.getOperand(i),
05474                       DAG.getIntPtrConstant(i));
05475     }
05476   }
05477 
05478   return V;
05479 }
05480 
05481 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
05482 static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
05483                                      unsigned NonZeros, unsigned NumNonZero,
05484                                      unsigned NumZero, SelectionDAG &DAG,
05485                                      const X86Subtarget *Subtarget,
05486                                      const TargetLowering &TLI) {
05487   // We know there's at least one non-zero element
05488   unsigned FirstNonZeroIdx = 0;
05489   SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
05490   while (FirstNonZero.getOpcode() == ISD::UNDEF ||
05491          X86::isZeroNode(FirstNonZero)) {
05492     ++FirstNonZeroIdx;
05493     FirstNonZero = Op->getOperand(FirstNonZeroIdx);
05494   }
05495 
05496   if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05497       !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
05498     return SDValue();
05499 
05500   SDValue V = FirstNonZero.getOperand(0);
05501   MVT VVT = V.getSimpleValueType();
05502   if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
05503     return SDValue();
05504 
05505   unsigned FirstNonZeroDst =
05506       cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
05507   unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
05508   unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
05509   unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
05510 
05511   for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
05512     SDValue Elem = Op.getOperand(Idx);
05513     if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
05514       continue;
05515 
05516     // TODO: What else can be here? Deal with it.
05517     if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
05518       return SDValue();
05519 
05520     // TODO: Some optimizations are still possible here
05521     // ex: Getting one element from a vector, and the rest from another.
05522     if (Elem.getOperand(0) != V)
05523       return SDValue();
05524 
05525     unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
05526     if (Dst == Idx)
05527       ++CorrectIdx;
05528     else if (IncorrectIdx == -1U) {
05529       IncorrectIdx = Idx;
05530       IncorrectDst = Dst;
05531     } else
05532       // There was already one element with an incorrect index.
05533       // We can't optimize this case to an insertps.
05534       return SDValue();
05535   }
05536 
05537   if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
05538     SDLoc dl(Op);
05539     EVT VT = Op.getSimpleValueType();
05540     unsigned ElementMoveMask = 0;
05541     if (IncorrectIdx == -1U)
05542       ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
05543     else
05544       ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
05545 
05546     SDValue InsertpsMask =
05547         DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
05548     return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
05549   }
05550 
05551   return SDValue();
05552 }
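
// Sketch of the INSERTPS immediate assembled above (example values are
// assumptions): bits 7:6 select the source element, bits 5:4 select the
// destination lane, and bits 3:0 zero out the lanes that must become 0.
#include <cstdint>

static uint8_t insertpsImm(unsigned SrcElt, unsigned DstLane,
                           unsigned NonZeroLanes /* 4-bit lane mask */) {
  return (uint8_t)((SrcElt << 6) | (DstLane << 4) | (~NonZeroLanes & 0xf));
}

// Moving element 2 of V into lane 0 while zeroing lanes 1 and 3
// (NonZeroLanes == 0x5) gives insertpsImm(2, 0, 0x5) == 0x8A.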
05553 
05554 /// getVShift - Return a vector logical shift node.
05555 ///
05556 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
05557                          unsigned NumBits, SelectionDAG &DAG,
05558                          const TargetLowering &TLI, SDLoc dl) {
05559   assert(VT.is128BitVector() && "Unknown type for VShift");
05560   EVT ShVT = MVT::v2i64;
05561   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
05562   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
05563   return DAG.getNode(ISD::BITCAST, dl, VT,
05564                      DAG.getNode(Opc, dl, ShVT, SrcOp,
05565                              DAG.getConstant(NumBits,
05566                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
05567 }
05568 
05569 static SDValue
05570 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
05571 
05572   // Check if the scalar load can be widened into a vector load, and if
05573   // the address is "base + cst", see if the cst can be "absorbed" into
05574   // the shuffle mask.
05575   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
05576     SDValue Ptr = LD->getBasePtr();
05577     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
05578       return SDValue();
05579     EVT PVT = LD->getValueType(0);
05580     if (PVT != MVT::i32 && PVT != MVT::f32)
05581       return SDValue();
05582 
05583     int FI = -1;
05584     int64_t Offset = 0;
05585     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
05586       FI = FINode->getIndex();
05587       Offset = 0;
05588     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
05589                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
05590       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
05591       Offset = Ptr.getConstantOperandVal(1);
05592       Ptr = Ptr.getOperand(0);
05593     } else {
05594       return SDValue();
05595     }
05596 
05597     // FIXME: 256-bit vector instructions don't require a strict alignment,
05598     // improve this code to support it better.
05599     unsigned RequiredAlign = VT.getSizeInBits()/8;
05600     SDValue Chain = LD->getChain();
05601     // Make sure the stack object alignment is at least 16 or 32.
05602     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
05603     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
05604       if (MFI->isFixedObjectIndex(FI)) {
05605         // Can't change the alignment. FIXME: It's possible to compute
05606         // the exact stack offset and reference FI + adjust offset instead.
05607         // If someone *really* cares about this, that's the way to implement it.
05608         return SDValue();
05609       } else {
05610         MFI->setObjectAlignment(FI, RequiredAlign);
05611       }
05612     }
05613 
05614     // (Offset % 16 or 32) must be a multiple of 4. The address is then
05615     // Ptr + (Offset & ~15).
05616     if (Offset < 0)
05617       return SDValue();
05618     if ((Offset % RequiredAlign) & 3)
05619       return SDValue();
05620     int64_t StartOffset = Offset & ~(RequiredAlign-1);
05621     if (StartOffset)
05622       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
05623                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
05624 
05625     int EltNo = (Offset - StartOffset) >> 2;
05626     unsigned NumElems = VT.getVectorNumElements();
05627 
05628     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
05629     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
05630                              LD->getPointerInfo().getWithOffset(StartOffset),
05631                              false, false, false, 0);
05632 
05633     SmallVector<int, 8> Mask;
05634     for (unsigned i = 0; i != NumElems; ++i)
05635       Mask.push_back(EltNo);
05636 
05637     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
05638   }
05639 
05640   return SDValue();
05641 }
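
// Standalone sketch of the offset arithmetic above: the scalar load is
// widened to a vector load starting at the aligned base, and the splatted
// element index is the 4-byte slot of the original scalar inside that
// widened load. The helper is hypothetical.
#include <cstdint>
#include <cstdio>

static void widenScalarLoad(int64_t Offset, unsigned RequiredAlign) {
  if (Offset < 0 || ((Offset % RequiredAlign) & 3))
    return;                                // rejected by the checks above
  int64_t StartOffset = Offset & ~(int64_t)(RequiredAlign - 1);
  int EltNo = (int)((Offset - StartOffset) >> 2);
  std::printf("load %u bytes at base+%lld, splat element %d\n",
              RequiredAlign, (long long)StartOffset, EltNo);
}

// widenScalarLoad(20, 16) prints: load 16 bytes at base+16, splat element 1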
05642 
05643 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
05644 /// vector of type 'VT', see if the elements can be replaced by a single large
05645 /// load which has the same value as a build_vector whose operands are 'elts'.
05646 ///
05647 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
05648 ///
05649 /// FIXME: we'd also like to handle the case where the last elements are zero
05650 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
05651 /// There's even a handy isZeroNode for that purpose.
05652 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
05653                                         SDLoc &DL, SelectionDAG &DAG,
05654                                         bool isAfterLegalize) {
05655   EVT EltVT = VT.getVectorElementType();
05656   unsigned NumElems = Elts.size();
05657 
05658   LoadSDNode *LDBase = nullptr;
05659   unsigned LastLoadedElt = -1U;
05660 
05661   // For each element in the initializer, see if we've found a load or an undef.
05662   // If we don't find an initial load element, or later load elements are
05663   // non-consecutive, bail out.
05664   for (unsigned i = 0; i < NumElems; ++i) {
05665     SDValue Elt = Elts[i];
05666 
05667     if (!Elt.getNode() ||
05668         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
05669       return SDValue();
05670     if (!LDBase) {
05671       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
05672         return SDValue();
05673       LDBase = cast<LoadSDNode>(Elt.getNode());
05674       LastLoadedElt = i;
05675       continue;
05676     }
05677     if (Elt.getOpcode() == ISD::UNDEF)
05678       continue;
05679 
05680     LoadSDNode *LD = cast<LoadSDNode>(Elt);
05681     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
05682       return SDValue();
05683     LastLoadedElt = i;
05684   }
05685 
05686   // If we have found an entire vector of loads and undefs, then return a large
05687   // load of the entire vector width starting at the base pointer.  If we found
05688   // consecutive loads for the low half, generate a vzext_load node.
05689   if (LastLoadedElt == NumElems - 1) {
05690 
05691     if (isAfterLegalize &&
05692         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
05693       return SDValue();
05694 
05695     SDValue NewLd = SDValue();
05696 
05697     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
05698       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05699                           LDBase->getPointerInfo(),
05700                           LDBase->isVolatile(), LDBase->isNonTemporal(),
05701                           LDBase->isInvariant(), 0);
05702     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05703                         LDBase->getPointerInfo(),
05704                         LDBase->isVolatile(), LDBase->isNonTemporal(),
05705                         LDBase->isInvariant(), LDBase->getAlignment());
05706 
05707     if (LDBase->hasAnyUseOfValue(1)) {
05708       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05709                                      SDValue(LDBase, 1),
05710                                      SDValue(NewLd.getNode(), 1));
05711       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05712       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05713                              SDValue(NewLd.getNode(), 1));
05714     }
05715 
05716     return NewLd;
05717   }
05718   if (NumElems == 4 && LastLoadedElt == 1 &&
05719       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
05720     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
05721     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
05722     SDValue ResNode =
05723         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
05724                                 LDBase->getPointerInfo(),
05725                                 LDBase->getAlignment(),
05726                                 false/*isVolatile*/, true/*ReadMem*/,
05727                                 false/*WriteMem*/);
05728 
05729     // Make sure the newly-created LOAD is in the same position as LDBase in
05730     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
05731     // update uses of LDBase's output chain to use the TokenFactor.
05732     if (LDBase->hasAnyUseOfValue(1)) {
05733       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05734                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
05735       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05736       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05737                              SDValue(ResNode.getNode(), 1));
05738     }
05739 
05740     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
05741   }
05742   return SDValue();
05743 }
05744 
05745 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
05746 /// to generate a splat value for the following cases:
05747 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
05748 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
05749 /// a scalar load, or a constant.
05750 /// The VBROADCAST node is returned when a pattern is found,
05751 /// or SDValue() otherwise.
05752 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
05753                                     SelectionDAG &DAG) {
05754   if (!Subtarget->hasFp256())
05755     return SDValue();
05756 
05757   MVT VT = Op.getSimpleValueType();
05758   SDLoc dl(Op);
05759 
05760   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
05761          "Unsupported vector type for broadcast.");
05762 
05763   SDValue Ld;
05764   bool ConstSplatVal;
05765 
05766   switch (Op.getOpcode()) {
05767     default:
05768       // Unknown pattern found.
05769       return SDValue();
05770 
05771     case ISD::BUILD_VECTOR: {
05772       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
05773       BitVector UndefElements;
05774       SDValue Splat = BVOp->getSplatValue(&UndefElements);
05775 
05776       // We need a splat of a single value to use broadcast, and it doesn't
05777       // make any sense if the value is only in one element of the vector.
05778       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
05779         return SDValue();
05780 
05781       Ld = Splat;
05782       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
05783                        Ld.getOpcode() == ISD::ConstantFP);
05784 
05785       // Make sure that all of the users of a non-constant load are from the
05786       // BUILD_VECTOR node.
05787       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
05788         return SDValue();
05789       break;
05790     }
05791 
05792     case ISD::VECTOR_SHUFFLE: {
05793       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
05794 
05795       // Shuffles must have a splat mask where the first element is
05796       // broadcasted.
05797       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
05798         return SDValue();
05799 
05800       SDValue Sc = Op.getOperand(0);
05801       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
05802           Sc.getOpcode() != ISD::BUILD_VECTOR) {
05803 
05804         if (!Subtarget->hasInt256())
05805           return SDValue();
05806 
05807         // Use the register form of the broadcast instruction available on AVX2.
05808         if (VT.getSizeInBits() >= 256)
05809           Sc = Extract128BitVector(Sc, 0, DAG, dl);
05810         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
05811       }
05812 
05813       Ld = Sc.getOperand(0);
05814       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
05815                        Ld.getOpcode() == ISD::ConstantFP);
05816 
05817       // The scalar_to_vector node and the suspected
05818       // load node must have exactly one user.
05819       // Constants may have multiple users.
05820 
05821       // AVX-512 has a register version of the broadcast.
05822       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
05823         Ld.getValueType().getSizeInBits() >= 32;
05824       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
05825           !hasRegVer))
05826         return SDValue();
05827       break;
05828     }
05829   }
05830 
05831   bool IsGE256 = (VT.getSizeInBits() >= 256);
05832 
05833   // Handle broadcasting a single constant scalar from the constant pool
05834   // into a vector. On Sandybridge it is still better to load a constant vector
05835   // from the constant pool and not to broadcast it from a scalar.
05836   if (ConstSplatVal && Subtarget->hasInt256()) {
05837     EVT CVT = Ld.getValueType();
05838     assert(!CVT.isVector() && "Must not broadcast a vector type");
05839     unsigned ScalarSize = CVT.getSizeInBits();
05840 
05841     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
05842       const Constant *C = nullptr;
05843       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
05844         C = CI->getConstantIntValue();
05845       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
05846         C = CF->getConstantFPValue();
05847 
05848       assert(C && "Invalid constant type");
05849 
05850       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
05851       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
05852       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
05853       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
05854                        MachinePointerInfo::getConstantPool(),
05855                        false, false, false, Alignment);
05856 
05857       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05858     }
05859   }
05860 
05861   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
05862   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
05863 
05864   // Handle AVX2 in-register broadcasts.
05865   if (!IsLoad && Subtarget->hasInt256() &&
05866       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
05867     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05868 
05869   // The scalar source must be a normal load.
05870   if (!IsLoad)
05871     return SDValue();
05872 
05873   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
05874     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05875 
05876   // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
05877   // match double, since there is no vbroadcastsd xmm.
05878   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
05879     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
05880       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05881   }
05882 
05883   // Unsupported broadcast.
05884   return SDValue();
05885 }
05886 
05887 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
05888 /// underlying vector and index.
05889 ///
05890 /// Modifies \p ExtractedFromVec to the real vector and returns the real
05891 /// index.
05892 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
05893                                          SDValue ExtIdx) {
05894   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
05895   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
05896     return Idx;
05897 
05898   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
05899   // lowered this:
05900   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
05901   // to:
05902   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
05903   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
05904   //                           undef)
05905   //                       Constant<0>)
05906   // In this case the vector is the extract_subvector expression and the index
05907   // is 2, as specified by the shuffle.
05908   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
05909   SDValue ShuffleVec = SVOp->getOperand(0);
05910   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
05911   assert(ShuffleVecVT.getVectorElementType() ==
05912          ExtractedFromVec.getSimpleValueType().getVectorElementType());
05913 
05914   int ShuffleIdx = SVOp->getMaskElt(Idx);
05915   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
05916     ExtractedFromVec = ShuffleVec;
05917     return ShuffleIdx;
05918   }
05919   return Idx;
05920 }
05921 
05922 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
05923   MVT VT = Op.getSimpleValueType();
05924 
05925   // Skip if insert_vec_elt is not supported.
05926   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
05927   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
05928     return SDValue();
05929 
05930   SDLoc DL(Op);
05931   unsigned NumElems = Op.getNumOperands();
05932 
05933   SDValue VecIn1;
05934   SDValue VecIn2;
05935   SmallVector<unsigned, 4> InsertIndices;
05936   SmallVector<int, 8> Mask(NumElems, -1);
05937 
05938   for (unsigned i = 0; i != NumElems; ++i) {
05939     unsigned Opc = Op.getOperand(i).getOpcode();
05940 
05941     if (Opc == ISD::UNDEF)
05942       continue;
05943 
05944     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
05945       // Quit if more than 1 element needs inserting.
05946       if (InsertIndices.size() > 1)
05947         return SDValue();
05948 
05949       InsertIndices.push_back(i);
05950       continue;
05951     }
05952 
05953     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
05954     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
05955     // Quit if non-constant index.
05956     if (!isa<ConstantSDNode>(ExtIdx))
05957       return SDValue();
05958     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
05959 
05960     // Quit if extracted from vector of different type.
05961     if (ExtractedFromVec.getValueType() != VT)
05962       return SDValue();
05963 
05964     if (!VecIn1.getNode())
05965       VecIn1 = ExtractedFromVec;
05966     else if (VecIn1 != ExtractedFromVec) {
05967       if (!VecIn2.getNode())
05968         VecIn2 = ExtractedFromVec;
05969       else if (VecIn2 != ExtractedFromVec)
05970         // Quit if more than 2 vectors to shuffle
05971         return SDValue();
05972     }
05973 
05974     if (ExtractedFromVec == VecIn1)
05975       Mask[i] = Idx;
05976     else if (ExtractedFromVec == VecIn2)
05977       Mask[i] = Idx + NumElems;
05978   }
05979 
05980   if (!VecIn1.getNode())
05981     return SDValue();
05982 
05983   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
05984   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
05985   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
05986     unsigned Idx = InsertIndices[i];
05987     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
05988                      DAG.getIntPtrConstant(Idx));
05989   }
05990 
05991   return NV;
05992 }
05993 
05994 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
05995 SDValue
05996 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
05997 
05998   MVT VT = Op.getSimpleValueType();
05999   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
06000          "Unexpected type in LowerBUILD_VECTORvXi1!");
06001 
06002   SDLoc dl(Op);
06003   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06004     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
06005     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06006     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06007   }
06008 
06009   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
06010     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
06011     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06012     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06013   }
06014 
06015   bool AllConstants = true;
06016   uint64_t Immediate = 0;
06017   int NonConstIdx = -1;
06018   bool IsSplat = true;
06019   unsigned NumNonConsts = 0;
06020   unsigned NumConsts = 0;
06021   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
06022     SDValue In = Op.getOperand(idx);
06023     if (In.getOpcode() == ISD::UNDEF)
06024       continue;
06025     if (!isa<ConstantSDNode>(In)) {
06026       AllConstants = false;
06027       NonConstIdx = idx;
06028       NumNonConsts++;
06029     }
06030     else {
06031       NumConsts++;
06032       if (cast<ConstantSDNode>(In)->getZExtValue())
06033         Immediate |= (1ULL << idx);
06034     }
06035     if (In != Op.getOperand(0))
06036       IsSplat = false;
06037   }
06038 
06039   if (AllConstants) {
06040     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
06041       DAG.getConstant(Immediate, MVT::i16));
06042     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
06043                        DAG.getIntPtrConstant(0));
06044   }
06045 
06046   if (NumNonConsts == 1 && NonConstIdx != 0) {
06047     SDValue DstVec;
06048     if (NumConsts) {
06049       SDValue VecAsImm = DAG.getConstant(Immediate,
06050                                          MVT::getIntegerVT(VT.getSizeInBits()));
06051       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
06052     }
06053     else 
06054       DstVec = DAG.getUNDEF(VT);
06055     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
06056                        Op.getOperand(NonConstIdx),
06057                        DAG.getIntPtrConstant(NonConstIdx));
06058   }
06059   if (!IsSplat && (NonConstIdx != 0))
06060     llvm_unreachable("Unsupported BUILD_VECTOR operation");
06061   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
06062   SDValue Select;
06063   if (IsSplat)
06064     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06065                           DAG.getConstant(-1, SelectVT),
06066                           DAG.getConstant(0, SelectVT));
06067   else
06068     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06069                          DAG.getConstant((Immediate | 1), SelectVT),
06070                          DAG.getConstant(Immediate, SelectVT));
06071   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
06072 }
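
// Minimal sketch of how the constant lanes of a vXi1 build_vector collapse
// into the scalar Immediate used above: each constant-true lane sets bit idx
// of an integer that is later bitcast back to the mask type.
#include <cstdint>
#include <vector>

static uint64_t maskImmediate(const std::vector<int> &Lanes) {
  uint64_t Immediate = 0;
  for (unsigned idx = 0; idx != Lanes.size(); ++idx)
    if (Lanes[idx])                        // constant non-zero lane
      Immediate |= (1ULL << idx);
  return Immediate;
}

// maskImmediate({1, 0, 1, 1}) == 13 (binary 1101), ready to be materialized
// as an i8/i16 constant and bitcast to the v8i1/v16i1 mask type.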
06073 
06074 /// \brief Return true if \p N implements a horizontal binop and place the
06075 /// operands for the horizontal binop in V0 and V1.
06076 /// 
06077 /// This is a helper function of PerformBUILD_VECTORCombine.
06078 /// This function checks that the build_vector \p N in input implements a
06079 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
06080 /// operation to match.
06081 /// For example, if \p Opcode is equal to ISD::ADD, then this function
06082 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
06083 /// is equal to ISD::SUB, then this function checks if this is a horizontal
06084 /// arithmetic sub.
06085 ///
06086 /// This function only analyzes elements of \p N whose indices are
06087 /// in range [BaseIdx, LastIdx).
06088 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
06089                               SelectionDAG &DAG,
06090                               unsigned BaseIdx, unsigned LastIdx,
06091                               SDValue &V0, SDValue &V1) {
06092   EVT VT = N->getValueType(0);
06093 
06094   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
06095   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
06096          "Invalid Vector in input!");
06097   
06098   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
06099   bool CanFold = true;
06100   unsigned ExpectedVExtractIdx = BaseIdx;
06101   unsigned NumElts = LastIdx - BaseIdx;
06102   V0 = DAG.getUNDEF(VT);
06103   V1 = DAG.getUNDEF(VT);
06104 
06105   // Check if N implements a horizontal binop.
06106   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
06107     SDValue Op = N->getOperand(i + BaseIdx);
06108 
06109     // Skip UNDEFs.
06110     if (Op->getOpcode() == ISD::UNDEF) {
06111       // Update the expected vector extract index.
06112       if (i * 2 == NumElts)
06113         ExpectedVExtractIdx = BaseIdx;
06114       ExpectedVExtractIdx += 2;
06115       continue;
06116     }
06117 
06118     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
06119 
06120     if (!CanFold)
06121       break;
06122 
06123     SDValue Op0 = Op.getOperand(0);
06124     SDValue Op1 = Op.getOperand(1);
06125 
06126     // Try to match the following pattern:
06127     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
06128     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06129         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06130         Op0.getOperand(0) == Op1.getOperand(0) &&
06131         isa<ConstantSDNode>(Op0.getOperand(1)) &&
06132         isa<ConstantSDNode>(Op1.getOperand(1)));
06133     if (!CanFold)
06134       break;
06135 
06136     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06137     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
06138 
06139     if (i * 2 < NumElts) {
06140       if (V0.getOpcode() == ISD::UNDEF)
06141         V0 = Op0.getOperand(0);
06142     } else {
06143       if (V1.getOpcode() == ISD::UNDEF)
06144         V1 = Op0.getOperand(0);
06145       if (i * 2 == NumElts)
06146         ExpectedVExtractIdx = BaseIdx;
06147     }
06148 
06149     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
06150     if (I0 == ExpectedVExtractIdx)
06151       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
06152     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
06153       // Try to match the following dag sequence:
06154       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
06155       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
06156     } else
06157       CanFold = false;
06158 
06159     ExpectedVExtractIdx += 2;
06160   }
06161 
06162   return CanFold;
06163 }
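
// Sketch (inputs are assumptions) of the extract-index pattern the loop above
// accepts in the non-commuted case: output lane i must be built from extract
// indices (2*i, 2*i + 1) of the same source, with the expected index
// restarting at 0 once the scan switches from V0 to V1 halfway through.
#include <cstddef>
#include <utility>
#include <vector>

static bool isHorizontalIndexPattern(const std::vector<std::pair<int, int> > &Ops) {
  size_t NumElts = Ops.size();
  unsigned Expected = 0;
  for (size_t i = 0; i != NumElts; ++i) {
    if (i * 2 == NumElts)
      Expected = 0;                        // second half reads from V1
    if (Ops[i].first != (int)Expected || Ops[i].second != (int)Expected + 1)
      return false;
    Expected += 2;
  }
  return true;
}

// For a 4-lane horizontal add, {{0,1},{2,3},{0,1},{2,3}} matches, while
// {{0,1},{1,2},{0,1},{2,3}} does not.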
06164 
06165 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
06166 /// a concat_vector. 
06167 ///
06168 /// This is a helper function of PerformBUILD_VECTORCombine.
06169 /// This function expects two 256-bit vectors called V0 and V1.
06170 /// At first, each vector is split into two separate 128-bit vectors.
06171 /// Then, the resulting 128-bit vectors are used to implement two
06172 /// horizontal binary operations. 
06173 ///
06174 /// The kind of horizontal binary operation is defined by \p X86Opcode.
06175 ///
06176 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
06177 /// the two new horizontal binops.
06178 /// When Mode is set, the first horizontal binop dag node would take as input
06179 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
06180 /// horizontal binop dag node would take as input the lower 128-bit of V1
06181 /// and the upper 128-bit of V1.
06182 ///   Example:
06183 ///     HADD V0_LO, V0_HI
06184 ///     HADD V1_LO, V1_HI
06185 ///
06186 /// Otherwise, the first horizontal binop dag node takes as input the lower
06187 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
06188 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
06189 ///   Example:
06190 ///     HADD V0_LO, V1_LO
06191 ///     HADD V0_HI, V1_HI
06192 ///
06193 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
06194 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
06195 /// the upper 128-bits of the result.
06196 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
06197                                      SDLoc DL, SelectionDAG &DAG,
06198                                      unsigned X86Opcode, bool Mode,
06199                                      bool isUndefLO, bool isUndefHI) {
06200   EVT VT = V0.getValueType();
06201   assert(VT.is256BitVector() && VT == V1.getValueType() &&
06202          "Invalid nodes in input!");
06203 
06204   unsigned NumElts = VT.getVectorNumElements();
06205   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
06206   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
06207   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
06208   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
06209   EVT NewVT = V0_LO.getValueType();
06210 
06211   SDValue LO = DAG.getUNDEF(NewVT);
06212   SDValue HI = DAG.getUNDEF(NewVT);
06213 
06214   if (Mode) {
06215     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06216     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
06217       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
06218     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
06219       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
06220   } else {
06221     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06222     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
06223                        V1_LO->getOpcode() != ISD::UNDEF))
06224       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
06225 
06226     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
06227                        V1_HI->getOpcode() != ISD::UNDEF))
06228       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
06229   }
06230 
06231   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
06232 }
06233 
06234 /// \brief Try to fold a build_vector that performs an 'addsub' into the
06235 /// sequence of 'vadd + vsub + blendi'.
06236 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
06237                            const X86Subtarget *Subtarget) {
06238   SDLoc DL(BV);
06239   EVT VT = BV->getValueType(0);
06240   unsigned NumElts = VT.getVectorNumElements();
06241   SDValue InVec0 = DAG.getUNDEF(VT);
06242   SDValue InVec1 = DAG.getUNDEF(VT);
06243 
06244   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
06245           VT == MVT::v2f64) && "build_vector with an invalid type found!");
06246 
06247   // Don't try to emit a VSELECT that cannot be lowered into a blend.
06248   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06249   if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
06250     return SDValue();
06251 
06252   // Odd-numbered elements in the input build vector are obtained from
06253   // adding two integer/float elements.
06254   // Even-numbered elements in the input build vector are obtained from
06255   // subtracting two integer/float elements.
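        // For illustration, a matching v4f32 build_vector has the shape:
        //   (build_vector (fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0)),
        //                 (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1)),
        //                 (fsub (extract_vector_elt A, 2), (extract_vector_elt B, 2)),
        //                 (fadd (extract_vector_elt A, 3), (extract_vector_elt B, 3)))
        // where A and B end up in InVec0 and InVec1 respectively.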
06256   unsigned ExpectedOpcode = ISD::FSUB;
06257   unsigned NextExpectedOpcode = ISD::FADD;
06258   bool AddFound = false;
06259   bool SubFound = false;
06260 
06261   for (unsigned i = 0, e = NumElts; i != e; i++) {
06262     SDValue Op = BV->getOperand(i);
06263       
06264     // Skip 'undef' values.
06265     unsigned Opcode = Op.getOpcode();
06266     if (Opcode == ISD::UNDEF) {
06267       std::swap(ExpectedOpcode, NextExpectedOpcode);
06268       continue;
06269     }
06270       
06271     // Early exit if we found an unexpected opcode.
06272     if (Opcode != ExpectedOpcode)
06273       return SDValue();
06274 
06275     SDValue Op0 = Op.getOperand(0);
06276     SDValue Op1 = Op.getOperand(1);
06277 
06278     // Try to match the following pattern:
06279     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
06280     // Early exit if we cannot match that sequence.
06281     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06282         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06283         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
06284         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
06285         Op0.getOperand(1) != Op1.getOperand(1))
06286       return SDValue();
06287 
06288     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06289     if (I0 != i)
06290       return SDValue();
06291 
06292     // We found a valid add/sub node. Update the information accordingly.
06293     if (i & 1)
06294       AddFound = true;
06295     else
06296       SubFound = true;
06297 
06298     // Update InVec0 and InVec1.
06299     if (InVec0.getOpcode() == ISD::UNDEF)
06300       InVec0 = Op0.getOperand(0);
06301     if (InVec1.getOpcode() == ISD::UNDEF)
06302       InVec1 = Op1.getOperand(0);
06303 
06304     // Make sure that the input operands of each add/sub node always
06305     // come from the same pair of vectors.
06306     if (InVec0 != Op0.getOperand(0)) {
06307       if (ExpectedOpcode == ISD::FSUB)
06308         return SDValue();
06309 
06310       // FADD is commutable. Try to commute the operands
06311       // and then test again.
06312       std::swap(Op0, Op1);
06313       if (InVec0 != Op0.getOperand(0))
06314         return SDValue();
06315     }
06316 
06317     if (InVec1 != Op1.getOperand(0))
06318       return SDValue();
06319 
06320     // Update the pair of expected opcodes.
06321     std::swap(ExpectedOpcode, NextExpectedOpcode);
06322   }
06323 
06324   // Only fold this build_vector into a VSELECT if we matched both an add and
06325   // a sub, and neither input vector is still UNDEF.
06326   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
06327       InVec1.getOpcode() != ISD::UNDEF) {
06328     // Emit a sequence of vector add and sub followed by a VSELECT.
06329     // The new VSELECT will be lowered into a BLENDI.
06330     // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI'
06331     // and emit a single ADDSUB instruction.
06332     SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1);
06333     SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1);
06334 
06335     // Construct the VSELECT mask.
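          // Each even lane gets an all-ones mask element (select from Sub) and each
          // odd lane gets zero (select from Add), matching the addsub pattern of
          // subtract in the even lanes and add in the odd lanes.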
06336     EVT MaskVT = VT.changeVectorElementTypeToInteger();
06337     EVT SVT = MaskVT.getVectorElementType();
06338     unsigned SVTBits = SVT.getSizeInBits();
06339     SmallVector<SDValue, 8> Ops;
06340 
06341     for (unsigned i = 0, e = NumElts; i != e; ++i) {
06342       APInt Value = i & 1 ? APInt::getNullValue(SVTBits) :
06343                             APInt::getAllOnesValue(SVTBits);
06344       SDValue Constant = DAG.getConstant(Value, SVT);
06345       Ops.push_back(Constant);
06346     }
06347 
06348     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops);
06349     return DAG.getSelect(DL, VT, Mask, Sub, Add);
06350   }
06351   
06352   return SDValue();
06353 }
06354 
06355 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
06356                                           const X86Subtarget *Subtarget) {
06357   SDLoc DL(N);
06358   EVT VT = N->getValueType(0);
06359   unsigned NumElts = VT.getVectorNumElements();
06360   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
06361   SDValue InVec0, InVec1;
06362 
06363   // Try to match an ADDSUB.
06364   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
06365       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
06366     SDValue Value = matchAddSub(BV, DAG, Subtarget);
06367     if (Value.getNode())
06368       return Value;
06369   }
06370 
06371   // Try to match horizontal ADD/SUB.
06372   unsigned NumUndefsLO = 0;
06373   unsigned NumUndefsHI = 0;
06374   unsigned Half = NumElts/2;
06375 
06376   // Count the number of UNDEF operands in the input build_vector.
06377   for (unsigned i = 0, e = Half; i != e; ++i)
06378     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06379       NumUndefsLO++;
06380 
06381   for (unsigned i = Half, e = NumElts; i != e; ++i)
06382     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06383       NumUndefsHI++;
06384 
06385   // Early exit if this is either a build_vector of all UNDEFs or if all but
06386   // one of the operands are UNDEF.
06387   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
06388     return SDValue();
06389 
06390   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
06391     // Try to match an SSE3 float HADD/HSUB.
06392     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06393       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06394     
06395     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06396       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06397   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
06398     // Try to match an SSSE3 integer HADD/HSUB.
06399     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06400       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
06401     
06402     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06403       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
06404   }
06405   
06406   if (!Subtarget->hasAVX())
06407     return SDValue();
06408 
06409   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
06410     // Try to match an AVX horizontal add/sub of packed single/double
06411     // precision floating point values from 256-bit vectors.
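          // The low and high halves of the build_vector are matched independently;
          // we only fold to a single 256-bit FHADD/FHSUB when both halves read from
          // the same pair of source vectors (or one side is UNDEF).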
06412     SDValue InVec2, InVec3;
06413     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
06414         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
06415         ((InVec0.getOpcode() == ISD::UNDEF ||
06416           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06417         ((InVec1.getOpcode() == ISD::UNDEF ||
06418           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06419       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06420 
06421     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
06422         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
06423         ((InVec0.getOpcode() == ISD::UNDEF ||
06424           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06425         ((InVec1.getOpcode() == ISD::UNDEF ||
06426           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06427       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06428   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
06429     // Try to match an AVX2 horizontal add/sub of signed integers.
06430     SDValue InVec2, InVec3;
06431     unsigned X86Opcode;
06432     bool CanFold = true;
06433 
06434     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
06435         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
06436         ((InVec0.getOpcode() == ISD::UNDEF ||
06437           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06438         ((InVec1.getOpcode() == ISD::UNDEF ||
06439           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06440       X86Opcode = X86ISD::HADD;
06441     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
06442         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
06443         ((InVec0.getOpcode() == ISD::UNDEF ||
06444           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06445         ((InVec1.getOpcode() == ISD::UNDEF ||
06446           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06447       X86Opcode = X86ISD::HSUB;
06448     else
06449       CanFold = false;
06450 
06451     if (CanFold) {
06452       // Fold this build_vector into a single horizontal add/sub.
06453       // Do this only if the target has AVX2.
06454       if (Subtarget->hasAVX2())
06455         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
06456  
06457       // Do not try to expand this build_vector into a pair of horizontal
06458       // add/sub if we can emit a pair of scalar add/sub.
06459       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06460         return SDValue();
06461 
06462       // Convert this build_vector into a pair of horizontal binops followed by
06463       // a concat vector.
06464       bool isUndefLO = NumUndefsLO == Half;
06465       bool isUndefHI = NumUndefsHI == Half;
06466       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
06467                                    isUndefLO, isUndefHI);
06468     }
06469   }
06470 
06471   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
06472        VT == MVT::v16i16) && Subtarget->hasAVX()) {
06473     unsigned X86Opcode;
06474     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06475       X86Opcode = X86ISD::HADD;
06476     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06477       X86Opcode = X86ISD::HSUB;
06478     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06479       X86Opcode = X86ISD::FHADD;
06480     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06481       X86Opcode = X86ISD::FHSUB;
06482     else
06483       return SDValue();
06484 
06485     // Don't try to expand this build_vector into a pair of horizontal add/sub
06486     // if we can simply emit a pair of scalar add/sub.
06487     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06488       return SDValue();
06489 
06490     // Convert this build_vector into two horizontal add/sub followed by
06491     // a concat vector.
06492     bool isUndefLO = NumUndefsLO == Half;
06493     bool isUndefHI = NumUndefsHI == Half;
06494     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
06495                                  isUndefLO, isUndefHI);
06496   }
06497 
06498   return SDValue();
06499 }
06500 
06501 SDValue
06502 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
06503   SDLoc dl(Op);
06504 
06505   MVT VT = Op.getSimpleValueType();
06506   MVT ExtVT = VT.getVectorElementType();
06507   unsigned NumElems = Op.getNumOperands();
06508 
06509   // Lower build_vectors of i1 elements (predicate vectors) separately.
06510   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
06511     return LowerBUILD_VECTORvXi1(Op, DAG);
06512 
06513   // Vectors containing all zeros can be matched by pxor and xorps later
06514   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06515     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
06516     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
06517     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
06518       return Op;
06519 
06520     return getZeroVector(VT, Subtarget, DAG, dl);
06521   }
06522 
06523   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
06524   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
06525   // vpcmpeqd on 256-bit vectors.
06526   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
06527     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
06528       return Op;
06529 
06530     if (!VT.is512BitVector())
06531       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
06532   }
06533 
06534   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
06535   if (Broadcast.getNode())
06536     return Broadcast;
06537 
06538   unsigned EVTBits = ExtVT.getSizeInBits();
06539 
06540   unsigned NumZero  = 0;
06541   unsigned NumNonZero = 0;
06542   unsigned NonZeros = 0;
06543   bool IsAllConstants = true;
06544   SmallSet<SDValue, 8> Values;
06545   for (unsigned i = 0; i < NumElems; ++i) {
06546     SDValue Elt = Op.getOperand(i);
06547     if (Elt.getOpcode() == ISD::UNDEF)
06548       continue;
06549     Values.insert(Elt);
06550     if (Elt.getOpcode() != ISD::Constant &&
06551         Elt.getOpcode() != ISD::ConstantFP)
06552       IsAllConstants = false;
06553     if (X86::isZeroNode(Elt))
06554       NumZero++;
06555     else {
06556       NonZeros |= (1 << i);
06557       NumNonZero++;
06558     }
06559   }
06560 
06561   // All-undef vector: return an UNDEF. All-zero vectors were handled above.
06562   if (NumNonZero == 0)
06563     return DAG.getUNDEF(VT);
06564 
06565   // Special case for single non-zero, non-undef, element.
06566   if (NumNonZero == 1) {
06567     unsigned Idx = countTrailingZeros(NonZeros);
06568     SDValue Item = Op.getOperand(Idx);
06569 
06570     // If this is an insertion of an i64 value on x86-32, and if the top bits of
06571     // the value are obviously zero, truncate the value to i32 and do the
06572     // insertion that way.  Only do this if the value is non-constant or if the
06573     // value is a constant being inserted into element 0.  It is cheaper to do
06574     // a constant pool load than it is to do a movd + shuffle.
06575     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
06576         (!IsAllConstants || Idx == 0)) {
06577       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
06578         // Handle SSE only.
06579         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
06580         EVT VecVT = MVT::v4i32;
06581         unsigned VecElts = 4;
06582 
06583         // Truncate the value (which may itself be a constant) to i32, and
06584         // convert it to a vector with movd (S2V+shuffle to zero extend).
06585         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
06586         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
06587         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06588 
06589         // Now we have our 32-bit value zero extended in the low element of
06590         // a vector.  If Idx != 0, swizzle it into place.
06591         if (Idx != 0) {
06592           SmallVector<int, 4> Mask;
06593           Mask.push_back(Idx);
06594           for (unsigned i = 1; i != VecElts; ++i)
06595             Mask.push_back(i);
06596           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
06597                                       &Mask[0]);
06598         }
06599         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06600       }
06601     }
06602 
06603     // If we have a constant or non-constant insertion into the low element of
06604     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
06605     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
06606     // depending on what the source datatype is.
06607     if (Idx == 0) {
06608       if (NumZero == 0)
06609         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06610 
06611       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
06612           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
06613         if (VT.is256BitVector() || VT.is512BitVector()) {
06614           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
06615           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
06616                              Item, DAG.getIntPtrConstant(0));
06617         }
06618         assert(VT.is128BitVector() && "Expected an SSE value type!");
06619         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06620         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
06621         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06622       }
06623 
06624       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
06625         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
06626         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
06627         if (VT.is256BitVector()) {
06628           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
06629           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
06630         } else {
06631           assert(VT.is128BitVector() && "Expected an SSE value type!");
06632           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06633         }
06634         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06635       }
06636     }
06637 
06638     // Is it a vector logical left shift?
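          // e.g. for v2i64, (build_vector 0, X) is emitted by placing X in lane 0
          // and shifting the whole vector left by half its width (64 bits here).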
06639     if (NumElems == 2 && Idx == 1 &&
06640         X86::isZeroNode(Op.getOperand(0)) &&
06641         !X86::isZeroNode(Op.getOperand(1))) {
06642       unsigned NumBits = VT.getSizeInBits();
06643       return getVShift(true, VT,
06644                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
06645                                    VT, Op.getOperand(1)),
06646                        NumBits/2, DAG, *this, dl);
06647     }
06648 
06649     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
06650       return SDValue();
06651 
06652     // Otherwise, if this is a vector with i32 or f32 elements, and the element
06653     // is a non-constant being inserted into an element other than the low one,
06654     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
06655     // movd/movss) to move this into the low element, then shuffle it into
06656     // place.
06657     if (EVTBits == 32) {
06658       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06659 
06660       // Turn it into a shuffle of zero and zero-extended scalar to vector.
06661       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
06662       SmallVector<int, 8> MaskVec;
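            // The mask places lane 0 of Item (the scalar) at position Idx and fills
            // every other lane from lane 1, which is zero when NumZero > 0 and
            // undef otherwise.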
06663       for (unsigned i = 0; i != NumElems; ++i)
06664         MaskVec.push_back(i == Idx ? 0 : 1);
06665       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
06666     }
06667   }
06668 
06669   // Splat is obviously ok. Let legalizer expand it to a shuffle.
06670   if (Values.size() == 1) {
06671     if (EVTBits == 32) {
06672       // Instead of a shuffle like this:
06673       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
06674       // Check if it's possible to issue this instead.
06675       // shuffle (vload ptr), undef, <1, 1, 1, 1>
06676       unsigned Idx = countTrailingZeros(NonZeros);
06677       SDValue Item = Op.getOperand(Idx);
06678       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
06679         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
06680     }
06681     return SDValue();
06682   }
06683 
06684   // A vector full of immediates; various special cases are already
06685   // handled, so this is best done with a single constant-pool load.
06686   if (IsAllConstants)
06687     return SDValue();
06688 
06689   // For AVX-length vectors, build the individual 128-bit pieces and use
06690   // shuffles to put them in place.
06691   if (VT.is256BitVector() || VT.is512BitVector()) {
06692     SmallVector<SDValue, 64> V;
06693     for (unsigned i = 0; i != NumElems; ++i)
06694       V.push_back(Op.getOperand(i));
06695 
06696     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
06697 
06698     // Build both the lower and upper subvectors.
06699     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
06700                                 makeArrayRef(&V[0], NumElems/2));
06701     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
06702                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
06703 
06704     // Recreate the wider vector with the lower and upper parts.
06705     if (VT.is256BitVector())
06706       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06707     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06708   }
06709 
06710   // Let legalizer expand 2-wide build_vectors.
06711   if (EVTBits == 64) {
06712     if (NumNonZero == 1) {
06713       // One half is zero or undef.
06714       unsigned Idx = countTrailingZeros(NonZeros);
06715       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
06716                                  Op.getOperand(Idx));
06717       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
06718     }
06719     return SDValue();
06720   }
06721 
06722   // If element VT is < 32 bits, convert it to inserts into a zero vector.
06723   if (EVTBits == 8 && NumElems == 16) {
06724     SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
06725                                       Subtarget, *this);
06726     if (V.getNode()) return V;
06727   }
06728 
06729   if (EVTBits == 16 && NumElems == 8) {
06730     SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
06731                                       Subtarget, *this);
06732     if (V.getNode()) return V;
06733   }
06734 
06735   // If the element VT is 32 bits and there are 4 elements, try to generate an INSERTPS.
06736   if (EVTBits == 32 && NumElems == 4) {
06737     SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
06738                                       NumZero, DAG, Subtarget, *this);
06739     if (V.getNode())
06740       return V;
06741   }
06742 
06743   // If the element VT is 32 bits, turn it into a number of shuffles.
06744   SmallVector<SDValue, 8> V(NumElems);
06745   if (NumElems == 4 && NumZero > 0) {
06746     for (unsigned i = 0; i < 4; ++i) {
06747       bool isZero = !(NonZeros & (1 << i));
06748       if (isZero)
06749         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
06750       else
06751         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
06752     }
06753 
06754     for (unsigned i = 0; i < 2; ++i) {
06755       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
06756         default: break;
06757         case 0:
06758           V[i] = V[i*2];  // Must be a zero vector.
06759           break;
06760         case 1:
06761           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
06762           break;
06763         case 2:
06764           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
06765           break;
06766         case 3:
06767           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
06768           break;
06769       }
06770     }
06771 
06772     bool Reverse1 = (NonZeros & 0x3) == 2;
06773     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
06774     int MaskVec[] = {
06775       Reverse1 ? 1 : 0,
06776       Reverse1 ? 0 : 1,
06777       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
06778       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
06779     };
06780     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
06781   }
06782 
06783   if (Values.size() > 1 && VT.is128BitVector()) {
06784     // Check for a build vector of consecutive loads.
06785     for (unsigned i = 0; i < NumElems; ++i)
06786       V[i] = Op.getOperand(i);
06787 
06788     // Check for elements which are consecutive loads.
06789     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
06790     if (LD.getNode())
06791       return LD;
06792 
06793     // Check for a build_vector that is mostly a shuffle plus a few insertions.
06794     SDValue Sh = buildFromShuffleMostly(Op, DAG);
06795     if (Sh.getNode())
06796       return Sh;
06797 
06798     // For SSE 4.1, use insertps to insert each non-undef element into place.
06799     if (getSubtarget()->hasSSE41()) {
06800       SDValue Result;
06801       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
06802         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
06803       else
06804         Result = DAG.getUNDEF(VT);
06805 
06806       for (unsigned i = 1; i < NumElems; ++i) {
06807         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
06808         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
06809                              Op.getOperand(i), DAG.getIntPtrConstant(i));
06810       }
06811       return Result;
06812     }
06813 
06814     // Otherwise, expand into a number of unpckl*, start by extending each of
06815     // our (non-undef) elements to the full vector width with the element in the
06816     // bottom slot of the vector (which generates no code for SSE).
06817     for (unsigned i = 0; i < NumElems; ++i) {
06818       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
06819         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
06820       else
06821         V[i] = DAG.getUNDEF(VT);
06822     }
06823 
06824     // Next, we iteratively mix elements, e.g. for v4f32:
06825     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
06826     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
06827     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
06828     unsigned EltStride = NumElems >> 1;
06829     while (EltStride != 0) {
06830       for (unsigned i = 0; i < EltStride; ++i) {
06831         // If V[i+EltStride] is undef and this is the first round of mixing,
06832         // then it is safe to just drop this shuffle: V[i] is already in the
06833         // right place, the one element (since it's the first round) being
06834         // inserted as undef can be dropped.  This isn't safe for successive
06835         // rounds because they will permute elements within both vectors.
06836         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
06837             EltStride == NumElems/2)
06838           continue;
06839 
06840         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
06841       }
06842       EltStride >>= 1;
06843     }
06844     return V[0];
06845   }
06846   return SDValue();
06847 }
06848 
06849 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
06850 // to create 256-bit vectors from two other 128-bit ones.
06851 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
06852   SDLoc dl(Op);
06853   MVT ResVT = Op.getSimpleValueType();
06854 
06855   assert((ResVT.is256BitVector() ||
06856           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
06857 
06858   SDValue V1 = Op.getOperand(0);
06859   SDValue V2 = Op.getOperand(1);
06860   unsigned NumElems = ResVT.getVectorNumElements();
06861   if (ResVT.is256BitVector())
06862     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
06863 
06864   if (Op.getNumOperands() == 4) {
06865     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
06866                                 ResVT.getVectorNumElements()/2);
06867     SDValue V3 = Op.getOperand(2);
06868     SDValue V4 = Op.getOperand(3);
06869     SDValue Lo = Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl);
06870     SDValue Hi = Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl);
          return Concat256BitVectors(Lo, Hi, ResVT, NumElems, DAG, dl);
06871   }
06872   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
06873 }
06874 
06875 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
06876   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
06877   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
06878          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
06879           Op.getNumOperands() == 4)));
06880 
06881   // AVX can use the vinsertf128 instruction to create 256-bit vectors
06882   // from two other 128-bit ones.
06883 
06884   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
06885   return LowerAVXCONCAT_VECTORS(Op, DAG);
06886 }
06887 
06888 
06889 //===----------------------------------------------------------------------===//
06890 // Vector shuffle lowering
06891 //
06892 // This is an experimental code path for lowering vector shuffles on x86. It is
06893 // designed to handle arbitrary vector shuffles and blends, gracefully
06894 // degrading performance as necessary. It works hard to recognize idiomatic
06895 // shuffles and lower them to optimal instruction patterns without leaving
06896 // a framework that allows reasonably efficient handling of all vector shuffle
06897 // patterns.
06898 //===----------------------------------------------------------------------===//
06899 
06900 /// \brief Tiny helper function to identify a no-op mask.
06901 ///
06902 /// This is a somewhat boring predicate function. It checks whether the mask
06903 /// array input, which is assumed to be a single-input shuffle mask of the kind
06904 /// used by the X86 shuffle instructions (not a fully general
06905 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
06906 /// in-place shuffle are 'no-op's.
06907 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
06908   for (int i = 0, Size = Mask.size(); i < Size; ++i)
06909     if (Mask[i] != -1 && Mask[i] != i)
06910       return false;
06911   return true;
06912 }
06913 
06914 /// \brief Helper function to classify a mask as a single-input mask.
06915 ///
06916 /// This isn't a generic single-input test because in the vector shuffle
06917 /// lowering we canonicalize single inputs to be the first input operand. This
06918 /// means we can more quickly test for a single input by only checking whether
06919 /// an input from the second operand exists. We also assume that the size of
06920 /// the mask corresponds to the size of the input vectors, which isn't true in
06921 /// the fully general case.
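      ///
      /// For a mask of size N, indices in [0, N) refer to the first operand and
      /// indices in [N, 2N) to the second, so any index >= N signals a second
      /// input.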
06922 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
06923   for (int M : Mask)
06924     if (M >= (int)Mask.size())
06925       return false;
06926   return true;
06927 }
06928 
06929 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
06930 ///
06931 /// This helper function produces an 8-bit shuffle immediate corresponding to
06932 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
06933 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
06934 /// example.
06935 ///
06936 /// NB: We rely heavily on "undef" masks preserving the input lane.
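      ///
      /// For example, the mask <3, 1, 2, 0> encodes as 0b00100111 (0x27): each
      /// mask element occupies two bits, least-significant element first.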
06937 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
06938                                           SelectionDAG &DAG) {
06939   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
06940   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
06941   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
06942   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
06943   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
06944 
06945   unsigned Imm = 0;
06946   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
06947   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
06948   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
06949   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
06950   return DAG.getConstant(Imm, MVT::i8);
06951 }
06952 
06953 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
06954 ///
06955 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
06956 /// support for floating point shuffles but not integer shuffles. These
06957 /// instructions will incur a domain crossing penalty on some chips, though, so
06958 /// it is better to avoid lowering through this for integer vectors where
06959 /// possible.
06960 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
06961                                        const X86Subtarget *Subtarget,
06962                                        SelectionDAG &DAG) {
06963   SDLoc DL(Op);
06964   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
06965   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
06966   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
06967   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
06968   ArrayRef<int> Mask = SVOp->getMask();
06969   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
06970 
06971   if (isSingleInputShuffleMask(Mask)) {
06972     // Straight shuffle of a single input vector. Simulate this by using the
06973     // single input as both of the "inputs" to this instruction.
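          // e.g. a <1, 1> splat mask becomes SHUFPD V1, V1 with immediate 0b11,
          // selecting element 1 of V1 for both result lanes.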
06974     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
06975     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
06976                        DAG.getConstant(SHUFPDMask, MVT::i8));
06977   }
06978   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
06979   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
06980 
06981   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
06982   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
06983                      DAG.getConstant(SHUFPDMask, MVT::i8));
06984 }
06985 
06986 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
06987 ///
06988 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
06989 /// the integer unit to minimize domain crossing penalties. However, for blends
06990 /// it falls back to the floating point shuffle operation with appropriate bit
06991 /// casting.
06992 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
06993                                        const X86Subtarget *Subtarget,
06994                                        SelectionDAG &DAG) {
06995   SDLoc DL(Op);
06996   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
06997   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
06998   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
06999   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
07000   ArrayRef<int> Mask = SVOp->getMask();
07001   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
07002 
07003   if (isSingleInputShuffleMask(Mask)) {
07004     // Straight shuffle of a single input vector. For everything from SSE2
07005     // onward this has a single fast instruction with no scary immediates.
07006     // We have to map the mask as it is actually a v4i32 shuffle instruction.
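          // e.g. the v2i64 mask <1, 0> widens to the v4i32 mask <2, 3, 0, 1>; undef
          // (-1) elements are treated as index 0.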
07007     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
07008     int WidenedMask[4] = {
07009         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
07010         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
07011     return DAG.getNode(
07012         ISD::BITCAST, DL, MVT::v2i64,
07013         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
07014                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
07015   }
07016 
07017   // We implement this with SHUFPD which is pretty lame because it will likely
07018   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
07019   // However, all the alternatives are still more cycles and newer chips don't
07020   // have this problem. It would be really nice if x86 had better shuffles here.
07021   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
07022   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
07023   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
07024                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
07025 }
07026 
07027 /// \brief Lower 4-lane 32-bit floating point shuffles.
07028 ///
07029 /// Uses instructions exclusively from the floating point unit to minimize
07030 /// domain crossing penalties, as these are sufficient to implement all v4f32
07031 /// shuffles.
07032 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
07033                                        const X86Subtarget *Subtarget,
07034                                        SelectionDAG &DAG) {
07035   SDLoc DL(Op);
07036   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
07037   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
07038   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
07039   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
07040   ArrayRef<int> Mask = SVOp->getMask();
07041   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
07042 
07043   SDValue LowV = V1, HighV = V2;
07044   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
07045 
07046   int NumV2Elements =
07047       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
07048 
07049   if (NumV2Elements == 0)
07050     // Straight shuffle of a single input vector. We pass the input vector to
07051     // both operands to simulate this with a SHUFPS.
07052     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
07053                        getV4X86ShuffleImm8ForMask(Mask, DAG));
07054 
07055   if (NumV2Elements == 1) {
07056     int V2Index =
07057         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
07058         Mask.begin();
07059     // Compute the index adjacent to V2Index and in the same half by toggling
07060     // the low bit.
07061     int V2AdjIndex = V2Index ^ 1;
07062 
07063     if (Mask[V2AdjIndex] == -1) {
07064       // Handles all the cases where we have a single V2 element and an undef.
07065       // This will only ever happen in the high lanes because we commute the
07066       // vector otherwise.
07067       if (V2Index < 2)
07068         std::swap(LowV, HighV);
07069       NewMask[V2Index] -= 4;
07070     } else {
07071       // Handle the case where the V2 element ends up adjacent to a V1 element.
07072       // To make this work, blend them together as the first step.
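            // For illustration, the mask <4, 1, 2, 3> (V2Index = 0, V1Index = 1)
            // first forms V2' = shufps(V2, V1) = <V2[0], V2[0], V1[1], V1[1]>, then
            // blends V2' with V1 using the adjusted mask computed below.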
07073       int V1Index = V2AdjIndex;
07074       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
07075       V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
07076                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
07077 
07078       // Now proceed to reconstruct the final blend as we have the necessary
07079       // high or low half formed.
07080       if (V2Index < 2) {
07081         LowV = V2;
07082         HighV = V1;
07083       } else {
07084         HighV = V2;
07085       }
07086       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
07087       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
07088     }
07089   } else if (NumV2Elements == 2) {
07090     if (Mask[0] < 4 && Mask[1] < 4) {
07091       // Handle the easy case where we have V1 in the low lanes and V2 in the
07092       // high lanes. We never see this reversed because we sort the shuffle.
07093       NewMask[2] -= 4;
07094       NewMask[3] -= 4;
07095     } else {
07096       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
07097       // trying to place elements directly, just blend them and set up the final
07098       // shuffle to place them.
07099 
07100       // The first two blend mask elements are for V1, the second two are for
07101       // V2.
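            // For illustration, the mask <4, 0, 5, 1> first blends to
            // V1' = shufps(V1, V2) = <V1[0], V1[1], V2[0], V2[1]>; the final shuffle
            // of V1' with itself then yields <V2[0], V1[0], V2[1], V1[1]>.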
07102       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
07103                           Mask[2] < 4 ? Mask[2] : Mask[3],
07104                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
07105                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
07106       V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
07107                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
07108 
07109       // Now we do a normal shuffle of V1 by giving V1 as both operands to
07110       // a blend.
07111       LowV = HighV = V1;
07112       NewMask[0] = Mask[0] < 4 ? 0 : 2;
07113       NewMask[1] = Mask[0] < 4 ? 2 : 0;
07114       NewMask[2] = Mask[2] < 4 ? 1 : 3;
07115       NewMask[3] = Mask[2] < 4 ? 3 : 1;
07116     }
07117   }
07118   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
07119                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
07120 }
07121 
07122 /// \brief Lower 4-lane i32 vector shuffles.
07123 ///
07124 /// We try to handle these with integer-domain shuffles where we can, but for
07125 /// blends we use the floating point domain blend instructions.
07126 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
07127                                        const X86Subtarget *Subtarget,
07128                                        SelectionDAG &DAG) {
07129   SDLoc DL(Op);
07130   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
07131   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
07132   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
07133   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
07134   ArrayRef<int> Mask = SVOp->getMask();
07135   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
07136 
07137   if (isSingleInputShuffleMask(Mask))
07138     // Straight shuffle of a single input vector. For everything from SSE2
07139     // onward this has a single fast instruction with no scary immediates.
07140     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
07141                        getV4X86ShuffleImm8ForMask(Mask, DAG));
07142 
07143   // We implement this with SHUFPS because it can blend from two vectors.
07144   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
07145   // up the inputs, bypassing domain shift penalties that we would incur if we
07146   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
07147   // relevant.
07148   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
07149                      DAG.getVectorShuffle(
07150                          MVT::v4f32, DL,
07151                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
07152                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
07153 }
07154 
07155 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
07156 /// shuffle lowering, and the most complex part.
07157 ///
07158 /// The lowering strategy is to try to form pairs of input lanes which are
07159 /// targeted at the same half of the final vector, and then use a dword shuffle
07160 /// to place them onto the right half, and finally unpack the paired lanes into
07161 /// their final position.
07162 ///
07163 /// The exact breakdown of how to form these dword pairs and align them on the
07164 /// correct sides is really tricky. See the comments within the function for
07165 /// more of the details.
07166 static SDValue lowerV8I16SingleInputVectorShuffle(
07167     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
07168     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
07169   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
07170   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
07171   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
07172 
07173   SmallVector<int, 4> LoInputs;
07174   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
07175