X86ISelLowering.cpp
00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86InstrBuilder.h"
00019 #include "X86MachineFunctionInfo.h"
00020 #include "X86TargetMachine.h"
00021 #include "X86TargetObjectFile.h"
00022 #include "llvm/ADT/SmallSet.h"
00023 #include "llvm/ADT/Statistic.h"
00024 #include "llvm/ADT/StringExtras.h"
00025 #include "llvm/ADT/StringSwitch.h"
00026 #include "llvm/ADT/VariadicFunction.h"
00027 #include "llvm/CodeGen/IntrinsicLowering.h"
00028 #include "llvm/CodeGen/MachineFrameInfo.h"
00029 #include "llvm/CodeGen/MachineFunction.h"
00030 #include "llvm/CodeGen/MachineInstrBuilder.h"
00031 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00032 #include "llvm/CodeGen/MachineModuleInfo.h"
00033 #include "llvm/CodeGen/MachineRegisterInfo.h"
00034 #include "llvm/IR/CallSite.h"
00035 #include "llvm/IR/CallingConv.h"
00036 #include "llvm/IR/Constants.h"
00037 #include "llvm/IR/DerivedTypes.h"
00038 #include "llvm/IR/Function.h"
00039 #include "llvm/IR/GlobalAlias.h"
00040 #include "llvm/IR/GlobalVariable.h"
00041 #include "llvm/IR/Instructions.h"
00042 #include "llvm/IR/Intrinsics.h"
00043 #include "llvm/MC/MCAsmInfo.h"
00044 #include "llvm/MC/MCContext.h"
00045 #include "llvm/MC/MCExpr.h"
00046 #include "llvm/MC/MCSymbol.h"
00047 #include "llvm/Support/CommandLine.h"
00048 #include "llvm/Support/Debug.h"
00049 #include "llvm/Support/ErrorHandling.h"
00050 #include "llvm/Support/MathExtras.h"
00051 #include "llvm/Target/TargetOptions.h"
00052 #include "X86IntrinsicsInfo.h"
00053 #include <bitset>
00054 #include <numeric>
00055 #include <cctype>
00056 using namespace llvm;
00057 
00058 #define DEBUG_TYPE "x86-isel"
00059 
00060 STATISTIC(NumTailCalls, "Number of tail calls");
00061 
00062 static cl::opt<bool> ExperimentalVectorWideningLegalization(
00063     "x86-experimental-vector-widening-legalization", cl::init(false),
00064     cl::desc("Enable an experimental vector type legalization through widening "
00065              "rather than promotion."),
00066     cl::Hidden);
00067 
00068 static cl::opt<bool> ExperimentalVectorShuffleLowering(
00069     "x86-experimental-vector-shuffle-lowering", cl::init(false),
00070     cl::desc("Enable an experimental vector shuffle lowering code path."),
00071     cl::Hidden);
00072 
00073 // Forward declarations.
00074 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00075                        SDValue V2);
00076 
00077 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
00078                                 SelectionDAG &DAG, SDLoc dl,
00079                                 unsigned vectorWidth) {
00080   assert((vectorWidth == 128 || vectorWidth == 256) &&
00081          "Unsupported vector width");
00082   EVT VT = Vec.getValueType();
00083   EVT ElVT = VT.getVectorElementType();
00084   unsigned Factor = VT.getSizeInBits()/vectorWidth;
00085   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
00086                                   VT.getVectorNumElements()/Factor);
00087 
00088   // Extract from UNDEF is UNDEF.
00089   if (Vec.getOpcode() == ISD::UNDEF)
00090     return DAG.getUNDEF(ResultVT);
00091 
00092   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
00093   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
00094 
00095   // This is the index of the first element of the vectorWidth-bit chunk
00096   // we want.
00097   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
00098                                * ElemsPerChunk);
00099 
00100   // If the input is a buildvector just emit a smaller one.
00101   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
00102     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
00103                        makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
00104                                     ElemsPerChunk));
00105 
00106   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00107   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
00108                                VecIdx);
00109 
00110   return Result;
00111 
00112 }
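// Illustrative example of the index normalization above (assuming a v8i32
// source and vectorWidth == 128): ElemsPerChunk = 128/32 = 4, so an unaligned
// IdxVal of 5 normalizes to ((5*32)/128)*4 = 4 and the extract returns the
// upper v4i32 half, elements <4,5,6,7>.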
00113 /// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
00114 /// sets things up to match an AVX VEXTRACTF128 / VEXTRACTI128
00115 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
00116 /// instruction or a simple subregister reference. Idx is an index in the
00117 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
00118 /// lowering EXTRACT_VECTOR_ELT operations easier.
00119 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
00120                                    SelectionDAG &DAG, SDLoc dl) {
00121   assert((Vec.getValueType().is256BitVector() ||
00122           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
00123   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
00124 }
00125 
00126 /// Generate a DAG to grab 256 bits from a 512-bit vector.
00127 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
00128                                    SelectionDAG &DAG, SDLoc dl) {
00129   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
00130   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
00131 }
00132 
00133 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
00134                                unsigned IdxVal, SelectionDAG &DAG,
00135                                SDLoc dl, unsigned vectorWidth) {
00136   assert((vectorWidth == 128 || vectorWidth == 256) &&
00137          "Unsupported vector width");
00138   // Inserting an UNDEF subvector leaves Result unchanged.
00139   if (Vec.getOpcode() == ISD::UNDEF)
00140     return Result;
00141   EVT VT = Vec.getValueType();
00142   EVT ElVT = VT.getVectorElementType();
00143   EVT ResultVT = Result.getValueType();
00144 
00145   // Insert the relevant vectorWidth bits.
00146   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
00147 
00148   // This is the index of the first element of the vectorWidth-bit chunk
00149   // we want.
00150   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
00151                                * ElemsPerChunk);
00152 
00153   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00154   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
00155                      VecIdx);
00156 }
00157 /// Generate a DAG to put 128 bits into a vector > 128 bits.  This
00158 /// sets things up to match an AVX VINSERTF128/VINSERTI128 or
00159 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instruction or a
00160 /// simple superregister reference.  Idx is an index in the 128 bits
00161 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
00162 /// lowering INSERT_VECTOR_ELT operations easier.
00163 static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
00164                                   unsigned IdxVal, SelectionDAG &DAG,
00165                                   SDLoc dl) {
00166   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
00167   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
00168 }
00169 
00170 static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
00171                                   unsigned IdxVal, SelectionDAG &DAG,
00172                                   SDLoc dl) {
00173   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
00174   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
00175 }
00176 
00177 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
00178 /// instructions. This is used because creating CONCAT_VECTORS nodes of
00179 /// BUILD_VECTORs returns a larger BUILD_VECTOR while we're trying to lower
00180 /// large BUILD_VECTORs.
00181 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
00182                                    unsigned NumElems, SelectionDAG &DAG,
00183                                    SDLoc dl) {
00184   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00185   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
00186 }
00187 
00188 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
00189                                    unsigned NumElems, SelectionDAG &DAG,
00190                                    SDLoc dl) {
00191   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00192   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
00193 }
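// Illustrative example (hypothetical operands): Concat128BitVectors(V1, V2,
// MVT::v8i32, 8, DAG, dl) inserts V1 at element 0 of an undef v8i32 and V2 at
// element NumElems/2 == 4, producing <V1[0..3], V2[0..3]>, with the second
// insert matching a single VINSERTF128.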
00194 
00195 static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
00196   if (TT.isOSBinFormatMachO()) {
00197     if (TT.getArch() == Triple::x86_64)
00198       return new X86_64MachoTargetObjectFile();
00199     return new TargetLoweringObjectFileMachO();
00200   }
00201 
00202   if (TT.isOSLinux())
00203     return new X86LinuxTargetObjectFile();
00204   if (TT.isOSBinFormatELF())
00205     return new TargetLoweringObjectFileELF();
00206   if (TT.isKnownWindowsMSVCEnvironment())
00207     return new X86WindowsTargetObjectFile();
00208   if (TT.isOSBinFormatCOFF())
00209     return new TargetLoweringObjectFileCOFF();
00210   llvm_unreachable("unknown subtarget type");
00211 }
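// Illustrative mapping (example triples, for orientation only):
//   x86_64-apple-darwin    -> X86_64MachoTargetObjectFile
//   i686-pc-linux-gnu      -> X86LinuxTargetObjectFile
//   x86_64-pc-windows-msvc -> X86WindowsTargetObjectFile
// with other ELF or COFF triples falling back to the generic
// TargetLoweringObjectFileELF / TargetLoweringObjectFileCOFF variants.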
00212 
00213 // FIXME: This should stop caching the target machine as soon as
00214 // we can remove resetOperationActions et al.
00215 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
00216   : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
00217   Subtarget = &TM.getSubtarget<X86Subtarget>();
00218   X86ScalarSSEf64 = Subtarget->hasSSE2();
00219   X86ScalarSSEf32 = Subtarget->hasSSE1();
00220   TD = getDataLayout();
00221 
00222   resetOperationActions();
00223 }
00224 
00225 void X86TargetLowering::resetOperationActions() {
00226   const TargetMachine &TM = getTargetMachine();
00227   static bool FirstTimeThrough = true;
00228 
00229   // If none of the target options have changed, then we don't need to reset the
00230   // operation actions.
00231   if (!FirstTimeThrough && TO == TM.Options) return;
00232 
00233   if (!FirstTimeThrough) {
00234     // Reinitialize the actions.
00235     initActions();
00236     FirstTimeThrough = false;
00237   }
00238 
00239   TO = TM.Options;
00240 
00241   // Set up the TargetLowering object.
00242   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00243 
00244   // X86 is weird; it always uses i8 for shift amounts and setcc results.
00245   setBooleanContents(ZeroOrOneBooleanContent);
00246   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00247   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
00248 
00249   // For 64-bit, since we have so many registers, use the ILP scheduler; for
00250   // 32-bit code, use the register-pressure-specific scheduling.
00251   // For Atom, always use ILP scheduling.
00252   if (Subtarget->isAtom())
00253     setSchedulingPreference(Sched::ILP);
00254   else if (Subtarget->is64Bit())
00255     setSchedulingPreference(Sched::ILP);
00256   else
00257     setSchedulingPreference(Sched::RegPressure);
00258   const X86RegisterInfo *RegInfo =
00259       TM.getSubtarget<X86Subtarget>().getRegisterInfo();
00260   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00261 
00262   // Bypass expensive divides on Atom when compiling with O2
00263   if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
00264     addBypassSlowDiv(32, 8);
00265     if (Subtarget->is64Bit())
00266       addBypassSlowDiv(64, 16);
00267   }
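  // Rough effect of the bypass above: a 32-bit divide on Atom at -O2 gets a
  // runtime check, and when both operands fit in 8 bits it is performed with
  // the much cheaper 8-bit divide instead of the full 32-bit DIV/IDIV (and, in
  // 64-bit mode, 64-bit divides may likewise be done as 16-bit ones).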
00268 
00269   if (Subtarget->isTargetKnownWindowsMSVC()) {
00270     // Set up Windows compiler runtime calls.
00271     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00272     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00273     setLibcallName(RTLIB::SREM_I64, "_allrem");
00274     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00275     setLibcallName(RTLIB::MUL_I64, "_allmul");
00276     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00277     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00278     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00279     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00280     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00281 
00282     // The _ftol2 runtime function has an unusual calling conv, which
00283     // is modeled by a special pseudo-instruction.
00284     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
00285     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
00286     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
00287     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
00288   }
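  // Illustrative consequence (assuming a 32-bit MSVC target): an i64 sdiv that
  // is not legalized inline becomes a call to "_alldiv" using the X86_StdCall
  // convention registered above, while fp-to-unsigned-i64 conversions bypass
  // the libcall table and go through the _ftol2 pseudo-instruction instead.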
00289 
00290   if (Subtarget->isTargetDarwin()) {
00291     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00292     setUseUnderscoreSetJmp(false);
00293     setUseUnderscoreLongJmp(false);
00294   } else if (Subtarget->isTargetWindowsGNU()) {
00295     // The MS runtime is weird: it exports _setjmp, but plain longjmp!
00296     setUseUnderscoreSetJmp(true);
00297     setUseUnderscoreLongJmp(false);
00298   } else {
00299     setUseUnderscoreSetJmp(true);
00300     setUseUnderscoreLongJmp(true);
00301   }
00302 
00303   // Set up the register classes.
00304   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00305   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00306   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00307   if (Subtarget->is64Bit())
00308     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00309 
00310   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
00311 
00312   // We don't accept any truncstore of integer registers.
00313   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00314   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00315   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00316   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00317   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00318   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00319 
00320   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00321 
00322   // SETOEQ and SETUNE require checking two conditions.
00323   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00324   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00325   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00326   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00327   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00328   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00329 
00330   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00331   // operation.
00332   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00333   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00334   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00335 
00336   if (Subtarget->is64Bit()) {
00337     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00338     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00339   } else if (!TM.Options.UseSoftFloat) {
00340     // We have an algorithm for SSE2->double, and we turn this into a
00341     // 64-bit FILD followed by conditional FADD for other targets.
00342     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00343     // We have an algorithm for SSE2, and we turn this into a 64-bit
00344     // FILD for other targets.
00345     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00346   }
00347 
00348   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00349   // this operation.
00350   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00351   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00352 
00353   if (!TM.Options.UseSoftFloat) {
00354     // SSE has no i16 to fp conversion, only i32
00355     if (X86ScalarSSEf32) {
00356       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00357       // f32 and f64 cases are Legal, f80 case is not
00358       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00359     } else {
00360       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00361       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00362     }
00363   } else {
00364     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00365     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00366   }
00367 
00368   // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
00369   // are Legal and f80 is custom lowered.
00370   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00371   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00372 
00373   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
00374   // this operation.
00375   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00376   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00377 
00378   if (X86ScalarSSEf32) {
00379     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00380     // f32 and f64 cases are Legal, f80 case is not
00381     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00382   } else {
00383     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00384     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00385   }
00386 
00387   // Handle FP_TO_UINT by promoting the destination to a larger signed
00388   // conversion.
00389   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00390   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00391   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
00392 
00393   if (Subtarget->is64Bit()) {
00394     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00395     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00396   } else if (!TM.Options.UseSoftFloat) {
00397     // Since AVX is a superset of SSE3, only check for SSE here.
00398     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00399       // Expand FP_TO_UINT into a select.
00400       // FIXME: We would like to use a Custom expander here eventually to do
00401       // the optimal thing for SSE vs. the default expansion in the legalizer.
00402       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00403     else
00404       // With SSE3 we can use fisttpll to convert to a signed i64; without
00405       // SSE3, we're stuck with a fistpll.
00406       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00407   }
00408 
00409   if (isTargetFTOL()) {
00410     // Use the _ftol2 runtime function, which has a pseudo-instruction
00411     // to handle its weird calling convention.
00412     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00413   }
00414 
00415   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00416   if (!X86ScalarSSEf64) {
00417     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00418     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00419     if (Subtarget->is64Bit()) {
00420       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00421       // Without SSE, i64->f64 goes through memory.
00422       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00423     }
00424   }
00425 
00426   // Scalar integer divide and remainder are lowered to use operations that
00427   // produce two results, to match the available instructions. This exposes
00428   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00429   // into a single instruction.
00430   //
00431   // Scalar integer multiply-high is also lowered to use two-result
00432   // operations, to match the available instructions. However, plain multiply
00433   // (low) operations are left as Legal, as there are single-result
00434   // instructions for this in x86. Using the two-result multiply instructions
00435   // when both high and low results are needed must be arranged by dagcombine.
00436   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00437     MVT VT = IntVTs[i];
00438     setOperationAction(ISD::MULHS, VT, Expand);
00439     setOperationAction(ISD::MULHU, VT, Expand);
00440     setOperationAction(ISD::SDIV, VT, Expand);
00441     setOperationAction(ISD::UDIV, VT, Expand);
00442     setOperationAction(ISD::SREM, VT, Expand);
00443     setOperationAction(ISD::UREM, VT, Expand);
00444 
00445     // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
00446     setOperationAction(ISD::ADDC, VT, Custom);
00447     setOperationAction(ISD::ADDE, VT, Custom);
00448     setOperationAction(ISD::SUBC, VT, Custom);
00449     setOperationAction(ISD::SUBE, VT, Custom);
00450   }
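  // Illustrative example: when a function computes both "a / b" and "a % b" on
  // i32, expanding SDIV and SREM exposes a shared ISD::SDIVREM node to CSE, so
  // a single 32-bit IDIV produces the quotient in EAX and the remainder in EDX.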
00451 
00452   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00453   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00454   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00455   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00456   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00457   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00458   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00459   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00460   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00461   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
00462   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
00463   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
00464   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
00465   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
00466   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
00467   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
00468   if (Subtarget->is64Bit())
00469     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00470   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00471   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00472   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00473   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00474   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00475   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00476   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00477   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00478 
00479   // Promote the i8 variants and force them up to i32, which has a shorter
00480   // encoding.
00481   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00482   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00483   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00484   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
00485   if (Subtarget->hasBMI()) {
00486     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00487     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00488     if (Subtarget->is64Bit())
00489       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00490   } else {
00491     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00492     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00493     if (Subtarget->is64Bit())
00494       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00495   }
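  // Rough summary of the CTTZ setup above: with BMI, plain CTTZ stays Legal
  // and maps onto TZCNT (which is defined for a zero input), so the
  // *_ZERO_UNDEF variants simply expand back to it; without BMI, the Custom
  // lowering emits a BSF and patches up the zero-input case separately.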
00496 
00497   if (Subtarget->hasLZCNT()) {
00498     // When promoting the i8 variants, force them to i32 for a shorter
00499     // encoding.
00500     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00501     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00502     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00503     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00504     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00505     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00506     if (Subtarget->is64Bit())
00507       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00508   } else {
00509     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00510     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00511     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00512     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00513     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00514     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00515     if (Subtarget->is64Bit()) {
00516       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00517       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00518     }
00519   }
00520 
00521   // Special handling for half-precision floating point conversions.
00522   // If we don't have F16C support, then lower half float conversions
00523   // into library calls.
00524   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
00525     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
00526     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
00527   }
00528 
00529   // There's never any support for operations beyond MVT::f32.
00530   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
00531   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
00532   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
00533   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
00534 
00535   setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
00536   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00537   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00538   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
00539 
00540   if (Subtarget->hasPOPCNT()) {
00541     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00542   } else {
00543     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00544     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00545     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00546     if (Subtarget->is64Bit())
00547       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00548   }
00549 
00550   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00551 
00552   if (!Subtarget->hasMOVBE())
00553     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00554 
00555   // These should be promoted to a larger select which is supported.
00556   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00557   // X86 wants to expand cmov itself.
00558   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00559   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00560   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00561   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00562   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00563   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00564   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00565   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00566   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00567   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00568   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00569   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00570   if (Subtarget->is64Bit()) {
00571     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00572     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00573   }
00574   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00575   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
00576   // handling; they are a light-weight setjmp/longjmp replacement used for
00577   // continuations, user-level threading, etc. As a result, no other SjLj
00578   // exception interfaces are implemented, so please don't build your own
00579   // exception handling on top of them.
00580   // LLVM/Clang supports zero-cost DWARF exception handling.
00581   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00582   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00583 
00584   // Darwin ABI issue.
00585   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00586   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00587   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00588   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00589   if (Subtarget->is64Bit())
00590     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00591   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00592   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00593   if (Subtarget->is64Bit()) {
00594     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00595     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00596     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00597     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00598     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00599   }
00600   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00601   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00602   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00603   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00604   if (Subtarget->is64Bit()) {
00605     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00606     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00607     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00608   }
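  // Illustrative example (approximate): on 32-bit x86 an i64 shift becomes a
  // *_PARTS node, and the Custom lowering combines SHLD/SHRD with plain shifts
  // on the two 32-bit halves plus a select for shift amounts of 32 or more.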
00609 
00610   if (Subtarget->hasSSE1())
00611     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00612 
00613   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00614 
00615   // Expand certain atomics
00616   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00617     MVT VT = IntVTs[i];
00618     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
00619     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00620     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00621   }
00622 
00623   if (Subtarget->hasCmpxchg16b()) {
00624     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
00625   }
00626 
00627   // FIXME - use subtarget debug flags
00628   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
00629       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
00630     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00631   }
00632 
00633   if (Subtarget->is64Bit()) {
00634     setExceptionPointerRegister(X86::RAX);
00635     setExceptionSelectorRegister(X86::RDX);
00636   } else {
00637     setExceptionPointerRegister(X86::EAX);
00638     setExceptionSelectorRegister(X86::EDX);
00639   }
00640   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00641   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00642 
00643   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00644   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00645 
00646   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00647   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00648 
00649   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00650   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00651   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00652   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00653     // TargetInfo::X86_64ABIBuiltinVaList
00654     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00655     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00656   } else {
00657     // TargetInfo::CharPtrBuiltinVaList
00658     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00659     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00660   }
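  // Illustrative contrast: on 64-bit SysV targets va_list is the multi-field
  // X86_64ABIBuiltinVaList struct, so VAARG/VACOPY need Custom lowering; the
  // char* va_list used elsewhere (including Win64) lets the generic Expand
  // handle them as simple pointer arithmetic and a pointer copy.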
00661 
00662   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00663   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00664 
00665   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
00666 
00667   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00668     // f32 and f64 use SSE.
00669     // Set up the FP register classes.
00670     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00671     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00672 
00673     // Use ANDPD to simulate FABS.
00674     setOperationAction(ISD::FABS , MVT::f64, Custom);
00675     setOperationAction(ISD::FABS , MVT::f32, Custom);
00676 
00677     // Use XORP to simulate FNEG.
00678     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00679     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00680 
00681     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00682     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00683     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00684 
00685     // Lower this to FGETSIGNx86 plus an AND.
00686     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00687     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00688 
00689     // We don't support sin/cos/fmod
00690     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00691     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00692     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00693     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00694     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00695     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00696 
00697     // Expand FP immediates into loads from the stack, except for the special
00698     // cases we handle.
00699     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00700     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00701   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00702     // Use SSE for f32, x87 for f64.
00703     // Set up the FP register classes.
00704     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00705     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00706 
00707     // Use ANDPS to simulate FABS.
00708     setOperationAction(ISD::FABS , MVT::f32, Custom);
00709 
00710     // Use XORP to simulate FNEG.
00711     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00712 
00713     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00714 
00715     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00716     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00717     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00718 
00719     // We don't support sin/cos/fmod
00720     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00721     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00722     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00723 
00724     // Special cases we handle for FP constants.
00725     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00726     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00727     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00728     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00729     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00730 
00731     if (!TM.Options.UnsafeFPMath) {
00732       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00733       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00734       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00735     }
00736   } else if (!TM.Options.UseSoftFloat) {
00737     // f32 and f64 in x87.
00738     // Set up the FP register classes.
00739     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00740     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00741 
00742     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00743     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00744     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00745     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00746 
00747     if (!TM.Options.UnsafeFPMath) {
00748       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00749       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00750       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00751       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00752       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00753       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00754     }
00755     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00756     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00757     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00758     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00759     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00760     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00761     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00762     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00763   }
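  // Illustrative detail for the Custom FABS/FNEG above: in the SSE register
  // classes they are implemented with bitwise constants, e.g. fabs(f64) ANDs
  // the value with 0x7FFFFFFFFFFFFFFF and fneg(f64) XORs it with
  // 0x8000000000000000, which is why the x87-only path skips that setup.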
00764 
00765   // We don't support FMA.
00766   setOperationAction(ISD::FMA, MVT::f64, Expand);
00767   setOperationAction(ISD::FMA, MVT::f32, Expand);
00768 
00769   // Long double always uses X87.
00770   if (!TM.Options.UseSoftFloat) {
00771     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00772     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00773     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00774     {
00775       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00776       addLegalFPImmediate(TmpFlt);  // FLD0
00777       TmpFlt.changeSign();
00778       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00779 
00780       bool ignored;
00781       APFloat TmpFlt2(+1.0);
00782       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00783                       &ignored);
00784       addLegalFPImmediate(TmpFlt2);  // FLD1
00785       TmpFlt2.changeSign();
00786       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00787     }
00788 
00789     if (!TM.Options.UnsafeFPMath) {
00790       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00791       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00792       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00793     }
00794 
00795     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00796     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00797     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00798     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00799     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00800     setOperationAction(ISD::FMA, MVT::f80, Expand);
00801   }
00802 
00803   // Always use a library call for pow.
00804   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00805   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00806   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00807 
00808   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00809   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00810   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00811   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00812   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00813 
00814   // First set operation action for all vector types to either promote
00815   // (for widening) or expand (for scalarization). Then we will selectively
00816   // turn on ones that can be effectively codegen'd.
00817   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
00818            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
00819     MVT VT = (MVT::SimpleValueType)i;
00820     setOperationAction(ISD::ADD , VT, Expand);
00821     setOperationAction(ISD::SUB , VT, Expand);
00822     setOperationAction(ISD::FADD, VT, Expand);
00823     setOperationAction(ISD::FNEG, VT, Expand);
00824     setOperationAction(ISD::FSUB, VT, Expand);
00825     setOperationAction(ISD::MUL , VT, Expand);
00826     setOperationAction(ISD::FMUL, VT, Expand);
00827     setOperationAction(ISD::SDIV, VT, Expand);
00828     setOperationAction(ISD::UDIV, VT, Expand);
00829     setOperationAction(ISD::FDIV, VT, Expand);
00830     setOperationAction(ISD::SREM, VT, Expand);
00831     setOperationAction(ISD::UREM, VT, Expand);
00832     setOperationAction(ISD::LOAD, VT, Expand);
00833     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00834     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00835     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00836     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00837     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00838     setOperationAction(ISD::FABS, VT, Expand);
00839     setOperationAction(ISD::FSIN, VT, Expand);
00840     setOperationAction(ISD::FSINCOS, VT, Expand);
00841     setOperationAction(ISD::FCOS, VT, Expand);
00842     setOperationAction(ISD::FSINCOS, VT, Expand);
00843     setOperationAction(ISD::FREM, VT, Expand);
00844     setOperationAction(ISD::FMA,  VT, Expand);
00845     setOperationAction(ISD::FPOWI, VT, Expand);
00846     setOperationAction(ISD::FSQRT, VT, Expand);
00847     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00848     setOperationAction(ISD::FFLOOR, VT, Expand);
00849     setOperationAction(ISD::FCEIL, VT, Expand);
00850     setOperationAction(ISD::FTRUNC, VT, Expand);
00851     setOperationAction(ISD::FRINT, VT, Expand);
00852     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00853     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00854     setOperationAction(ISD::MULHS, VT, Expand);
00855     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00856     setOperationAction(ISD::MULHU, VT, Expand);
00857     setOperationAction(ISD::SDIVREM, VT, Expand);
00858     setOperationAction(ISD::UDIVREM, VT, Expand);
00859     setOperationAction(ISD::FPOW, VT, Expand);
00860     setOperationAction(ISD::CTPOP, VT, Expand);
00861     setOperationAction(ISD::CTTZ, VT, Expand);
00862     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00863     setOperationAction(ISD::CTLZ, VT, Expand);
00864     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00865     setOperationAction(ISD::SHL, VT, Expand);
00866     setOperationAction(ISD::SRA, VT, Expand);
00867     setOperationAction(ISD::SRL, VT, Expand);
00868     setOperationAction(ISD::ROTL, VT, Expand);
00869     setOperationAction(ISD::ROTR, VT, Expand);
00870     setOperationAction(ISD::BSWAP, VT, Expand);
00871     setOperationAction(ISD::SETCC, VT, Expand);
00872     setOperationAction(ISD::FLOG, VT, Expand);
00873     setOperationAction(ISD::FLOG2, VT, Expand);
00874     setOperationAction(ISD::FLOG10, VT, Expand);
00875     setOperationAction(ISD::FEXP, VT, Expand);
00876     setOperationAction(ISD::FEXP2, VT, Expand);
00877     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00878     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00879     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00880     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00881     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00882     setOperationAction(ISD::TRUNCATE, VT, Expand);
00883     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00884     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00885     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00886     setOperationAction(ISD::VSELECT, VT, Expand);
00887     setOperationAction(ISD::SELECT_CC, VT, Expand);
00888     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
00889              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
00890       setTruncStoreAction(VT,
00891                           (MVT::SimpleValueType)InnerVT, Expand);
00892     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
00893     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
00894 
00895     // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types;
00896     // we have to deal with them whether we ask for Expansion or not. Setting
00897     // Expand causes its own optimisation problems though, so leave them legal.
00898     if (VT.getVectorElementType() == MVT::i1)
00899       setLoadExtAction(ISD::EXTLOAD, VT, Expand);
00900   }
00901 
00902   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00903   // with -msoft-float, disable use of MMX as well.
00904   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00905     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00906     // No operations on x86mmx supported, everything uses intrinsics.
00907   }
00908 
00909   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00910   // into smaller operations.
00911   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
00912   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
00913   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
00914   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
00915   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
00916   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
00917   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
00918   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
00919   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
00920   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
00921   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
00922   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
00923   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
00924   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
00925   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
00926   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
00927   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
00928   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
00929   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
00930   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
00931   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00932   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
00933   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
00934   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
00935   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
00936   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
00937   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
00938   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
00939   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
00940 
00941   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00942     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00943 
00944     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00945     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00946     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00947     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00948     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00949     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00950     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00951     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00952     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00953     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00954     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00955     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00956   }
00957 
00958   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00959     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00960 
00961     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
00962     // registers cannot be used even for integer operations.
00963     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00964     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00965     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00966     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00967 
00968     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00969     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00970     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00971     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00972     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00973     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00974     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
00975     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
00976     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
00977     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
00978     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00979     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00980     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00981     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00982     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00983     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00984     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00985     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00986     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00987     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00988     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00989     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00990 
00991     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00992     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00993     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00994     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00995 
00996     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00997     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00998     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00999     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01000     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01001 
01002     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
01003     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01004       MVT VT = (MVT::SimpleValueType)i;
01005       // Do not attempt to custom lower non-power-of-2 vectors
01006       if (!isPowerOf2_32(VT.getVectorNumElements()))
01007         continue;
01008       // Do not attempt to custom lower non-128-bit vectors
01009       if (!VT.is128BitVector())
01010         continue;
01011       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01012       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01013       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01014     }
01015 
01016     // We support custom legalizing of sext and anyext loads for specific
01017     // memory vector types which we can load as a scalar (or sequence of
01018     // scalars) and extend in-register to a legal 128-bit vector type. For sext
01019     // loads these must work with a single scalar load.
01020     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
01021     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
01022     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
01023     setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
01024     setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
01025     setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
01026     setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
01027     setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
01028     setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);
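    // Illustrative example (assuming SSE2 only): a sextload of v4i8 can be
    // legalized as one i32 scalar load, a scalar_to_vector, and an in-register
    // sign extension to v4i32 (for instance a lane-wise shift left followed by
    // an arithmetic shift right), rather than four separate byte loads.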
01029 
01030     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
01031     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
01032     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
01033     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
01034     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
01035     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
01036 
01037     if (Subtarget->is64Bit()) {
01038       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01039       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01040     }
01041 
01042     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
01043     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01044       MVT VT = (MVT::SimpleValueType)i;
01045 
01046       // Do not attempt to promote non-128-bit vectors
01047       if (!VT.is128BitVector())
01048         continue;
01049 
01050       setOperationAction(ISD::AND,    VT, Promote);
01051       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
01052       setOperationAction(ISD::OR,     VT, Promote);
01053       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
01054       setOperationAction(ISD::XOR,    VT, Promote);
01055       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
01056       setOperationAction(ISD::LOAD,   VT, Promote);
01057       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
01058       setOperationAction(ISD::SELECT, VT, Promote);
01059       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
01060     }
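    // Illustrative example of the promotion above: a v4i32 AND is bitcast to
    // v2i64, performed as the single legal v2i64 AND (one PAND), and bitcast
    // back, so every 128-bit logical op funnels through one pattern.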
01061 
01062     // Custom lower v2i64 and v2f64 selects.
01063     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
01064     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
01065     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
01066     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
01067 
01068     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
01069     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
01070 
01071     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
01072     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
01073     // As there is no 64-bit GPR available, we need to build a special custom
01074     // sequence to convert from v2i32 to v2f32.
01075     if (!Subtarget->is64Bit())
01076       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
01077 
01078     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
01079     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
01080 
01081     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
01082 
01083     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
01084     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
01085     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
01086   }
01087 
01088   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
01089     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
01090     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
01091     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
01092     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
01093     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
01094     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
01095     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
01096     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
01097     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
01098     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
01099 
01100     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
01101     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
01102     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
01103     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
01104     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
01105     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
01106     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
01107     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
01108     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
01109     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
01110 
01111     // FIXME: Do we need to handle scalar-to-vector here?
01112     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
01113 
01114     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
01115     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
01116     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
01117     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
01118     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
01119     // There is no BLENDI for byte vectors, so we don't custom lower the
01120     // v16i8 vselect for now.
01121     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
01122 
01123     // SSE41 brings specific instructions for doing vector sign extend even in
01124     // cases where we don't have SRA.
01125     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
01126     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
01127     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);
01128 
01129     // i8 and i16 vectors are custom because the source register and source
01130     // memory operand types are not the same width.  f32 vectors are
01131     // custom since the immediate controlling the insert encodes additional
01132     // information.
01133     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
01134     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01135     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01136     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01137 
01138     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
01139     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
01140     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
01141     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
01142 
01143     // FIXME: these should be Legal, but that's only for the case where
01144     // the index is constant.  For now custom expand to deal with that.
01145     if (Subtarget->is64Bit()) {
01146       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01147       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01148     }
01149   }
01150 
01151   if (Subtarget->hasSSE2()) {
01152     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
01153     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
01154 
01155     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01156     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01157 
01158     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01159     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01160 
01161     // In the custom shift lowering, the cases that are legal with AVX2 will
01162     // be recognized.
01163     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01164     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01165 
01166     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01167     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01168 
01169     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01170   }
01171 
01172   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01173     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01174     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01175     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01176     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01177     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01178     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01179 
01180     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01181     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01182     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01183 
01184     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01185     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01186     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01187     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01188     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01189     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01190     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01191     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01192     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01193     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01194     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01195     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01196 
01197     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01198     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01199     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01200     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01201     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01202     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01203     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01204     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01205     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01206     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01207     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01208     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01209 
01210     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01211     // even though v8i16 is a legal type.
01212     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01213     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01214     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01215 
01216     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01217     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01218     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01219 
01220     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01221     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01222 
01223     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
01224 
01225     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01226     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01227 
01228     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01229     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01230 
01231     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01232     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01233 
01234     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01235     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01236     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01237     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01238 
01239     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01240     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01241     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01242 
01243     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
01244     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
01245     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
01246     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
01247 
01248     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01249     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01250     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01251     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01252     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01253     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01254     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01255     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01256     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01257     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01258     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01259     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01260 
01261     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01262       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01263       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01264       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01265       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01266       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01267       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01268     }
01269 
01270     if (Subtarget->hasInt256()) {
01271       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01272       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01273       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01274       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01275 
01276       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01277       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01278       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01279       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01280 
01281       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01282       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01283       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01284       // Don't lower v32i8 because there is no 128-bit byte mul
01285 
01286       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
01287       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
01288       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
01289       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
01290 
01291       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
01292       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01293     } else {
01294       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01295       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01296       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01297       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01298 
01299       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01300       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01301       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01302       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01303 
01304       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01305       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01306       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01307       // Don't lower v32i8 because there is no 128-bit byte mul
01308     }
01309 
01310     // In the custom shift lowering, the cases that are legal with AVX2 will
01311     // be recognized.
01312     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01313     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01314 
01315     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01316     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01317 
01318     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01319 
01320     // Custom lower several nodes for 256-bit types.
01321     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01322              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01323       MVT VT = (MVT::SimpleValueType)i;
01324 
01325       // Extract subvector is special because the value type
01326       // (result) is 128-bit but the source is 256-bit wide.
01327       if (VT.is128BitVector())
01328         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01329 
01330       // Do not attempt to custom lower other non-256-bit vectors
01331       if (!VT.is256BitVector())
01332         continue;
01333 
01334       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01335       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01336       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01337       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01338       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01339       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01340       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01341     }
01342 
01343     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01344     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01345       MVT VT = (MVT::SimpleValueType)i;
01346 
01347       // Do not attempt to promote non-256-bit vectors
01348       if (!VT.is256BitVector())
01349         continue;
01350 
01351       setOperationAction(ISD::AND,    VT, Promote);
01352       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01353       setOperationAction(ISD::OR,     VT, Promote);
01354       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01355       setOperationAction(ISD::XOR,    VT, Promote);
01356       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01357       setOperationAction(ISD::LOAD,   VT, Promote);
01358       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01359       setOperationAction(ISD::SELECT, VT, Promote);
01360       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01361     }
01362   }
01363 
01364   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01365     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01366     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01367     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01368     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01369 
01370     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01371     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01372     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01373 
01374     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01375     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01376     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01377     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01378     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01379     setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
01380     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01381     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01382     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01383     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01384     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01385 
01386     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01387     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01388     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01389     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01390     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01391     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01392 
01393     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01394     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01395     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01396     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01397     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01398     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01399     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01400     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01401 
01402     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01403     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01404     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01405     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01406     if (Subtarget->is64Bit()) {
01407       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01408       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01409       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01410       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01411     }
01412     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01413     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01414     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01415     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01416     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01417     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01418     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01419     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01420     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01421     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01422 
01423     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01424     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01425     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01426     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01427     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01428     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01429     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01430     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01431     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01432     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01433     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01434     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01435     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01436 
01437     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01438     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01439     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01440     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01441     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
01442     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01443 
01444     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01445     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01446 
01447     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01448 
01449     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01450     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01451     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01452     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01453     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01454     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01455     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01456     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01457     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01458 
01459     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01460     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01461 
01462     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01463     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01464 
01465     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01466 
01467     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01468     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01469 
01470     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01471     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01472 
01473     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01474     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01475 
01476     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01477     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01478     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01479     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01480     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01481     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01482 
01483     if (Subtarget->hasCDI()) {
01484       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
01485       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
01486     }
01487 
01488     // Custom lower several nodes.
01489     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01490              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01491       MVT VT = (MVT::SimpleValueType)i;
01492 
01493       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01494       // Extract subvector is special because the value type
01495       // (result) is 256/128-bit but the source is 512-bit wide.
01496       if (VT.is128BitVector() || VT.is256BitVector())
01497         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01498 
01499       if (VT.getVectorElementType() == MVT::i1)
01500         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01501 
01502       // Do not attempt to custom lower other non-512-bit vectors
01503       if (!VT.is512BitVector())
01504         continue;
01505 
01506       if (EltSize >= 32) {
01507         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01508         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01509         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01510         setOperationAction(ISD::VSELECT,             VT, Legal);
01511         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01512         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01513         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01514       }
01515     }
01516     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01517       MVT VT = (MVT::SimpleValueType)i;
01518 
01519       // Do not attempt to promote non-512-bit vectors
01520       if (!VT.is512BitVector())
01521         continue;
01522 
01523       setOperationAction(ISD::SELECT, VT, Promote);
01524       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01525     }
01526   } // has AVX-512
01527 
01528   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
01529     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
01530     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
01531 
01532     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
01533     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
01534 
01535     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
01536     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
01537     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
01538     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
01539 
01540     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01541       const MVT VT = (MVT::SimpleValueType)i;
01542 
01543       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01544 
01545       // Do not attempt to custom lower non-512-bit vectors
01546       if (!VT.is512BitVector())
01547         continue;
01548 
01549       if (EltSize < 32) {
01550         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01551         setOperationAction(ISD::VSELECT,             VT, Legal);
01552       }
01553     }
01554   }
01555 
01556   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
01557     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
01558     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
01559 
01560     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
01561     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
01562   }
01563 
01564   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the
01565   // expansion of these nodes with custom code.
01566   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
01567            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
01568     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
01569                        Custom);
01570   }
01571 
01572   // We want to custom lower some of our intrinsics.
01573   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01574   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01575   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01576   if (!Subtarget->is64Bit())
01577     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01578 
01579   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01580   // handle type legalization for these operations here.
01581   //
01582   // FIXME: We really should do custom legalization for addition and
01583   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01584   // than generic legalization for 64-bit multiplication-with-overflow, though.
01585   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01586     // Add/Sub/Mul with overflow operations are custom lowered.
01587     MVT VT = IntVTs[i];
01588     setOperationAction(ISD::SADDO, VT, Custom);
01589     setOperationAction(ISD::UADDO, VT, Custom);
01590     setOperationAction(ISD::SSUBO, VT, Custom);
01591     setOperationAction(ISD::USUBO, VT, Custom);
01592     setOperationAction(ISD::SMULO, VT, Custom);
01593     setOperationAction(ISD::UMULO, VT, Custom);
01594   }
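
  // Roughly, this is the path taken by the overflow intrinsics: IR such as
  //
  //   %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  //
  // becomes an ISD::SADDO node, and the Custom action above lets X86 lower it
  // to an add that also produces the overflow flag directly.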
01595 
01596   // There are no 8-bit 3-address imul/mul instructions
01597   setOperationAction(ISD::SMULO, MVT::i8, Expand);
01598   setOperationAction(ISD::UMULO, MVT::i8, Expand);
01599 
01600   if (!Subtarget->is64Bit()) {
01601     // These libcalls are not available in 32-bit.
01602     setLibcallName(RTLIB::SHL_I128, nullptr);
01603     setLibcallName(RTLIB::SRL_I128, nullptr);
01604     setLibcallName(RTLIB::SRA_I128, nullptr);
01605   }
01606 
01607   // Combine sin / cos into one node or libcall if possible.
01608   if (Subtarget->hasSinCos()) {
01609     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01610     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01611     if (Subtarget->isTargetDarwin()) {
01612       // For MacOSX, we don't want the normal expansion of a libcall to
01613       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
01614       // traffic.
01615       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01616       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01617     }
01618   }
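
  // As a sketch of the effect of the Custom FSINCOS actions above: a sin/cos
  // pair on the same operand that is combined into a single ISD::FSINCOS node
  // is lowered on Darwin to one __sincos_stret call returning both results,
  // instead of two separate libcalls.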
01619 
01620   if (Subtarget->isTargetWin64()) {
01621     setOperationAction(ISD::SDIV, MVT::i128, Custom);
01622     setOperationAction(ISD::UDIV, MVT::i128, Custom);
01623     setOperationAction(ISD::SREM, MVT::i128, Custom);
01624     setOperationAction(ISD::UREM, MVT::i128, Custom);
01625     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
01626     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
01627   }
01628 
01629   // We have target-specific dag combine patterns for the following nodes:
01630   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01631   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01632   setTargetDAGCombine(ISD::VSELECT);
01633   setTargetDAGCombine(ISD::SELECT);
01634   setTargetDAGCombine(ISD::SHL);
01635   setTargetDAGCombine(ISD::SRA);
01636   setTargetDAGCombine(ISD::SRL);
01637   setTargetDAGCombine(ISD::OR);
01638   setTargetDAGCombine(ISD::AND);
01639   setTargetDAGCombine(ISD::ADD);
01640   setTargetDAGCombine(ISD::FADD);
01641   setTargetDAGCombine(ISD::FSUB);
01642   setTargetDAGCombine(ISD::FMA);
01643   setTargetDAGCombine(ISD::SUB);
01644   setTargetDAGCombine(ISD::LOAD);
01645   setTargetDAGCombine(ISD::STORE);
01646   setTargetDAGCombine(ISD::ZERO_EXTEND);
01647   setTargetDAGCombine(ISD::ANY_EXTEND);
01648   setTargetDAGCombine(ISD::SIGN_EXTEND);
01649   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01650   setTargetDAGCombine(ISD::TRUNCATE);
01651   setTargetDAGCombine(ISD::SINT_TO_FP);
01652   setTargetDAGCombine(ISD::SETCC);
01653   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
01654   setTargetDAGCombine(ISD::BUILD_VECTOR);
01655   if (Subtarget->is64Bit())
01656     setTargetDAGCombine(ISD::MUL);
01657   setTargetDAGCombine(ISD::XOR);
01658 
01659   computeRegisterProperties();
01660 
01661   // On Darwin, -Os means optimize for size without hurting performance,
01662   // so do not reduce the limit.
01663   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01664   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01665   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01666   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01667   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01668   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
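
  // These limits bound how many stores the inline expansion of @llvm.memset,
  // @llvm.memcpy, and @llvm.memmove may emit before falling back to a library
  // call; e.g. with 16-byte vector stores, MaxStoresPerMemset = 16 allows a
  // memset of up to roughly 256 bytes to be expanded inline.
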
01669   setPrefLoopAlignment(4); // 2^4 bytes.
01670 
01671   // Predictable cmovs don't hurt on Atom because it's in-order.
01672   PredictableSelectIsExpensive = !Subtarget->isAtom();
01673 
01674   setPrefFunctionAlignment(4); // 2^4 bytes.
01675 
01676   verifyIntrinsicTables();
01677 }
01678 
01679 // This has so far only been implemented for 64-bit MachO.
01680 bool X86TargetLowering::useLoadStackGuardNode() const {
01681   return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO &&
01682          Subtarget->is64Bit();
01683 }
01684 
01685 TargetLoweringBase::LegalizeTypeAction
01686 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
01687   if (ExperimentalVectorWideningLegalization &&
01688       VT.getVectorNumElements() != 1 &&
01689       VT.getVectorElementType().getSimpleVT() != MVT::i1)
01690     return TypeWidenVector;
01691 
01692   return TargetLoweringBase::getPreferredVectorAction(VT);
01693 }
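
// A rough illustration of the widening-vs-promotion distinction this hook
// controls: for an illegal type such as v2i32,
//
//   %r = add <2 x i32> %a, %b
//
// the default promotion path widens the elements (v2i32 -> v2i64), while
// TypeWidenVector keeps the i32 elements and pads the vector out to the next
// legal width (v2i32 -> v4i32).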
01694 
01695 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01696   if (!VT.isVector())
01697     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
01698 
01699   const unsigned NumElts = VT.getVectorNumElements();
01700   const EVT EltVT = VT.getVectorElementType();
01701   if (VT.is512BitVector()) {
01702     if (Subtarget->hasAVX512())
01703       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01704           EltVT == MVT::f32 || EltVT == MVT::f64)
01705         switch(NumElts) {
01706         case  8: return MVT::v8i1;
01707         case 16: return MVT::v16i1;
01708       }
01709     if (Subtarget->hasBWI())
01710       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01711         switch(NumElts) {
01712         case 32: return MVT::v32i1;
01713         case 64: return MVT::v64i1;
01714       }
01715   }
01716 
01717   if (VT.is256BitVector() || VT.is128BitVector()) {
01718     if (Subtarget->hasVLX())
01719       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01720           EltVT == MVT::f32 || EltVT == MVT::f64)
01721         switch(NumElts) {
01722         case 2: return MVT::v2i1;
01723         case 4: return MVT::v4i1;
01724         case 8: return MVT::v8i1;
01725       }
01726     if (Subtarget->hasBWI() && Subtarget->hasVLX())
01727       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01728         switch(NumElts) {
01729         case  8: return MVT::v8i1;
01730         case 16: return MVT::v16i1;
01731         case 32: return MVT::v32i1;
01732       }
01733   }
01734 
01735   return VT.changeVectorElementTypeToInteger();
01736 }
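
// Illustrative results of the mapping above: a scalar compare yields i1 with
// AVX-512 and i8 otherwise; with AVX-512, a v16f32 compare yields the v16i1
// mask type; and anything not covered falls through to
// changeVectorElementTypeToInteger().  A usage sketch (the TLI and Ctx names
// are assumed for illustration):
//
//   EVT CCVT = TLI.getSetCCResultType(Ctx, MVT::v8i32); // v8i1 with VLX,
//                                                       // v8i32 otherwise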
01737 
01738 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
01739 /// the desired ByVal argument alignment.
01740 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01741   if (MaxAlign == 16)
01742     return;
01743   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01744     if (VTy->getBitWidth() == 128)
01745       MaxAlign = 16;
01746   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01747     unsigned EltAlign = 0;
01748     getMaxByValAlign(ATy->getElementType(), EltAlign);
01749     if (EltAlign > MaxAlign)
01750       MaxAlign = EltAlign;
01751   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01752     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01753       unsigned EltAlign = 0;
01754       getMaxByValAlign(STy->getElementType(i), EltAlign);
01755       if (EltAlign > MaxAlign)
01756         MaxAlign = EltAlign;
01757       if (MaxAlign == 16)
01758         break;
01759     }
01760   }
01761 }
01762 
01763 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
01764 /// function arguments in the caller parameter area. For X86, aggregates
01765 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01766 /// are at 4-byte boundaries.
01767 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01768   if (Subtarget->is64Bit()) {
01769     // Max of 8 and alignment of type.
01770     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01771     if (TyAlign > 8)
01772       return TyAlign;
01773     return 8;
01774   }
01775 
01776   unsigned Align = 4;
01777   if (Subtarget->hasSSE1())
01778     getMaxByValAlign(Ty, Align);
01779   return Align;
01780 }
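
// Worked examples of the rule above, assuming a 32-bit target with SSE1 (the
// 64-bit path simply uses max(8, ABI alignment)):
//
//   struct A { int x, y; };        // no vector members -> byval align 4
//   struct B { __m128 v; int x; }; // contains a 128-bit vector -> align 16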
01781 
01782 /// getOptimalMemOpType - Returns the target specific optimal type for load
01783 /// and store operations as a result of memset, memcpy, and memmove
01784 /// lowering. If DstAlign is zero, any destination alignment can satisfy the
01785 /// constraint. Similarly, if SrcAlign is zero there is no need to check it
01786 /// against the alignment requirement,
01787 /// probably because the source does not need to be loaded. If 'IsMemset' is
01788 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
01789 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
01790 /// source is constant so it does not need to be loaded.
01791 /// It returns EVT::Other if the type should be determined using generic
01792 /// target-independent logic.
01793 EVT
01794 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01795                                        unsigned DstAlign, unsigned SrcAlign,
01796                                        bool IsMemset, bool ZeroMemset,
01797                                        bool MemcpyStrSrc,
01798                                        MachineFunction &MF) const {
01799   const Function *F = MF.getFunction();
01800   if ((!IsMemset || ZeroMemset) &&
01801       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
01802                                        Attribute::NoImplicitFloat)) {
01803     if (Size >= 16 &&
01804         (Subtarget->isUnalignedMemAccessFast() ||
01805          ((DstAlign == 0 || DstAlign >= 16) &&
01806           (SrcAlign == 0 || SrcAlign >= 16)))) {
01807       if (Size >= 32) {
01808         if (Subtarget->hasInt256())
01809           return MVT::v8i32;
01810         if (Subtarget->hasFp256())
01811           return MVT::v8f32;
01812       }
01813       if (Subtarget->hasSSE2())
01814         return MVT::v4i32;
01815       if (Subtarget->hasSSE1())
01816         return MVT::v4f32;
01817     } else if (!MemcpyStrSrc && Size >= 8 &&
01818                !Subtarget->is64Bit() &&
01819                Subtarget->hasSSE2()) {
01820       // Do not use f64 to lower memcpy if the source is a string constant.
01821       // It's better to use i32 to avoid the loads.
01822       return MVT::f64;
01823     }
01824   }
01825   if (Subtarget->is64Bit() && Size >= 8)
01826     return MVT::i64;
01827   return MVT::i32;
01828 }
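
// For example, under the rules above a 32-byte-or-larger memcpy with AVX2 and
// fast (or sufficiently aligned) unaligned access is lowered with v8i32
// stores, SSE2 gives v4i32, and SSE1 gives v4f32; an 8-byte memcpy from a
// non-constant source on a 32-bit SSE2 target uses f64; otherwise the result
// is i64 on 64-bit targets and i32 elsewhere.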
01829 
01830 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01831   if (VT == MVT::f32)
01832     return X86ScalarSSEf32;
01833   else if (VT == MVT::f64)
01834     return X86ScalarSSEf64;
01835   return true;
01836 }
01837 
01838 bool
01839 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
01840                                                   unsigned,
01841                                                   unsigned,
01842                                                   bool *Fast) const {
01843   if (Fast)
01844     *Fast = Subtarget->isUnalignedMemAccessFast();
01845   return true;
01846 }
01847 
01848 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
01849 /// current function.  The returned value is a member of the
01850 /// MachineJumpTableInfo::JTEntryKind enum.
01851 unsigned X86TargetLowering::getJumpTableEncoding() const {
01852   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01853   // symbol.
01854   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01855       Subtarget->isPICStyleGOT())
01856     return MachineJumpTableInfo::EK_Custom32;
01857 
01858   // Otherwise, use the normal jump table encoding heuristics.
01859   return TargetLowering::getJumpTableEncoding();
01860 }
01861 
01862 const MCExpr *
01863 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01864                                              const MachineBasicBlock *MBB,
01865                                              unsigned uid, MCContext &Ctx) const {
01866   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
01867          Subtarget->isPICStyleGOT());
01868   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01869   // entries.
01870   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01871                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01872 }
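
// In the GOT PIC case handled above, each 32-bit jump-table entry is thus a
// @GOTOFF reference to the block's label, e.g. (assembly sketch, label name
// invented for illustration):
//
//   .long .LBB0_3@GOTOFF
//
// rather than an absolute block address.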
01873 
01874 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
01875 /// jumptable.
01876 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01877                                                     SelectionDAG &DAG) const {
01878   if (!Subtarget->is64Bit())
01879     // This doesn't have SDLoc associated with it, but is not really the
01880     // same as a Register.
01881     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01882   return Table;
01883 }
01884 
01885 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
01886 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
01887 /// MCExpr.
01888 const MCExpr *X86TargetLowering::
01889 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01890                              MCContext &Ctx) const {
01891   // X86-64 uses RIP relative addressing based on the jump table label.
01892   if (Subtarget->isPICStyleRIPRel())
01893     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01894 
01895   // Otherwise, the reference is relative to the PIC base.
01896   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01897 }
01898 
01899 // FIXME: Why is this routine here? Move it to RegInfo!
01900 std::pair<const TargetRegisterClass*, uint8_t>
01901 X86TargetLowering::findRepresentativeClass(MVT VT) const {
01902   const TargetRegisterClass *RRC = nullptr;
01903   uint8_t Cost = 1;
01904   switch (VT.SimpleTy) {
01905   default:
01906     return TargetLowering::findRepresentativeClass(VT);
01907   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01908     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
01909     break;
01910   case MVT::x86mmx:
01911     RRC = &X86::VR64RegClass;
01912     break;
01913   case MVT::f32: case MVT::f64:
01914   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01915   case MVT::v4f32: case MVT::v2f64:
01916   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01917   case MVT::v4f64:
01918     RRC = &X86::VR128RegClass;
01919     break;
01920   }
01921   return std::make_pair(RRC, Cost);
01922 }
01923 
01924 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01925                                                unsigned &Offset) const {
01926   if (!Subtarget->isTargetLinux())
01927     return false;
01928 
01929   if (Subtarget->is64Bit()) {
01930     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
01931     Offset = 0x28;
01932     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01933       AddressSpace = 256;
01934     else
01935       AddressSpace = 257;
01936   } else {
01937     // %gs:0x14 on i386
01938     Offset = 0x14;
01939     AddressSpace = 256;
01940   }
01941   return true;
01942 }
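
// Example of the locations reported above: on x86-64 Linux the stack
// protector cookie is %fs:0x28 (address space 257, or 256, i.e. %gs, under
// the kernel code model), and on i386 Linux it is %gs:0x14 (address space
// 256).  A rough usage sketch (the TLI name is assumed):
//
//   unsigned AS = 0, Offset = 0;
//   if (TLI.getStackCookieLocation(AS, Offset))
//     ; // emit a load of the cookie from the segment-relative address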
01943 
01944 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01945                                             unsigned DestAS) const {
01946   assert(SrcAS != DestAS && "Expected different address spaces!");
01947 
01948   return SrcAS < 256 && DestAS < 256;
01949 }
01950 
01951 //===----------------------------------------------------------------------===//
01952 //               Return Value Calling Convention Implementation
01953 //===----------------------------------------------------------------------===//
01954 
01955 #include "X86GenCallingConv.inc"
01956 
01957 bool
01958 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01959                                   MachineFunction &MF, bool isVarArg,
01960                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01961                         LLVMContext &Context) const {
01962   SmallVector<CCValAssign, 16> RVLocs;
01963   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
01964   return CCInfo.CheckReturn(Outs, RetCC_X86);
01965 }
01966 
01967 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
01968   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
01969   return ScratchRegs;
01970 }
01971 
01972 SDValue
01973 X86TargetLowering::LowerReturn(SDValue Chain,
01974                                CallingConv::ID CallConv, bool isVarArg,
01975                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01976                                const SmallVectorImpl<SDValue> &OutVals,
01977                                SDLoc dl, SelectionDAG &DAG) const {
01978   MachineFunction &MF = DAG.getMachineFunction();
01979   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01980 
01981   SmallVector<CCValAssign, 16> RVLocs;
01982   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
01983   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01984 
01985   SDValue Flag;
01986   SmallVector<SDValue, 6> RetOps;
01987   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
01988   // Operand #1 = Bytes To Pop
01989   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
01990                    MVT::i16));
01991 
01992   // Copy the result values into the output registers.
01993   for (unsigned i = 0; i != RVLocs.size(); ++i) {
01994     CCValAssign &VA = RVLocs[i];
01995     assert(VA.isRegLoc() && "Can only return in registers!");
01996     SDValue ValToCopy = OutVals[i];
01997     EVT ValVT = ValToCopy.getValueType();
01998 
01999     // Promote values to the appropriate types
02000     if (VA.getLocInfo() == CCValAssign::SExt)
02001       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
02002     else if (VA.getLocInfo() == CCValAssign::ZExt)
02003       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
02004     else if (VA.getLocInfo() == CCValAssign::AExt)
02005       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
02006     else if (VA.getLocInfo() == CCValAssign::BCvt)
02007       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
02008 
02009     assert(VA.getLocInfo() != CCValAssign::FPExt &&
02010            "Unexpected FP-extend for return value.");
02011 
02012     // If this is x86-64, and we disabled SSE, we can't return FP values,
02013     // or SSE or MMX vectors.
02014     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
02015          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
02016           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
02017       report_fatal_error("SSE register return with SSE disabled");
02018     }
02019     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
02020     // llvm-gcc has never done it right and no one has noticed, so this
02021     // should be OK for now.
02022     if (ValVT == MVT::f64 &&
02023         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
02024       report_fatal_error("SSE2 register return with SSE2 disabled");
02025 
02026     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
02027     // the RET instruction and handled by the FP Stackifier.
02028     if (VA.getLocReg() == X86::FP0 ||
02029         VA.getLocReg() == X86::FP1) {
02030       // If this is a copy from an xmm register to ST(0), use an FPExtend to
02031       // change the value to the FP stack register class.
02032       if (isScalarFPTypeInSSEReg(VA.getValVT()))
02033         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
02034       RetOps.push_back(ValToCopy);
02035       // Don't emit a copytoreg.
02036       continue;
02037     }
02038 
02039     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
02040     // which is returned in RAX / RDX.
02041     if (Subtarget->is64Bit()) {
02042       if (ValVT == MVT::x86mmx) {
02043         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
02044           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
02045           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
02046                                   ValToCopy);
02047           // If we don't have SSE2 available, convert to v4f32 so the generated
02048           // register is legal.
02049           if (!Subtarget->hasSSE2())
02050             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
02051         }
02052       }
02053     }
02054 
02055     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
02056     Flag = Chain.getValue(1);
02057     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
02058   }
02059 
02060   // The x86-64 ABIs require that for returning structs by value we copy
02061   // the sret argument into %rax/%eax (depending on ABI) for the return.
02062   // Win32 requires us to put the sret argument to %eax as well.
02063   // We saved the argument into a virtual register in the entry block,
02064   // so now we copy the value out and into %rax/%eax.
02065   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
02066       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
02067     MachineFunction &MF = DAG.getMachineFunction();
02068     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02069     unsigned Reg = FuncInfo->getSRetReturnReg();
02070     assert(Reg &&
02071            "SRetReturnReg should have been set in LowerFormalArguments().");
02072     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
02073 
02074     unsigned RetValReg
02075         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
02076           X86::RAX : X86::EAX;
02077     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
02078     Flag = Chain.getValue(1);
02079 
02080     // RAX/EAX now acts like a return value.
02081     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
02082   }
02083 
02084   RetOps[0] = Chain;  // Update chain.
02085 
02086   // Add the flag if we have it.
02087   if (Flag.getNode())
02088     RetOps.push_back(Flag);
02089 
02090   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
02091 }
02092 
02093 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
02094   if (N->getNumValues() != 1)
02095     return false;
02096   if (!N->hasNUsesOfValue(1, 0))
02097     return false;
02098 
02099   SDValue TCChain = Chain;
02100   SDNode *Copy = *N->use_begin();
02101   if (Copy->getOpcode() == ISD::CopyToReg) {
02102     // If the copy has a glue operand, we conservatively assume it isn't safe to
02103     // perform a tail call.
02104     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
02105       return false;
02106     TCChain = Copy->getOperand(0);
02107   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
02108     return false;
02109 
02110   bool HasRet = false;
02111   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
02112        UI != UE; ++UI) {
02113     if (UI->getOpcode() != X86ISD::RET_FLAG)
02114       return false;
02115     // If we are returning more than one value, we can definitely
02116     // not make a tail call; see PR19530.
02117     if (UI->getNumOperands() > 4)
02118       return false;
02119     if (UI->getNumOperands() == 4 &&
02120         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
02121       return false;
02122     HasRet = true;
02123   }
02124 
02125   if (!HasRet)
02126     return false;
02127 
02128   Chain = TCChain;
02129   return true;
02130 }
02131 
02132 EVT
02133 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
02134                                             ISD::NodeType ExtendKind) const {
02135   MVT ReturnMVT;
02136   // TODO: Is this also valid on 32-bit?
02137   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
02138     ReturnMVT = MVT::i8;
02139   else
02140     ReturnMVT = MVT::i32;
02141 
02142   EVT MinVT = getRegisterType(Context, ReturnMVT);
02143   return VT.bitsLT(MinVT) ? MinVT : VT;
02144 }
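
// For example, an i1 return value that is zero-extended is widened only to i8
// on x86-64, while any other return type narrower than i32 (such as a
// sign-extended i8 or i16) is reported as i32.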
02145 
02146 /// LowerCallResult - Lower the result values of a call into the
02147 /// appropriate copies out of appropriate physical registers.
02148 ///
02149 SDValue
02150 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
02151                                    CallingConv::ID CallConv, bool isVarArg,
02152                                    const SmallVectorImpl<ISD::InputArg> &Ins,
02153                                    SDLoc dl, SelectionDAG &DAG,
02154                                    SmallVectorImpl<SDValue> &InVals) const {
02155 
02156   // Assign locations to each value returned by this call.
02157   SmallVector<CCValAssign, 16> RVLocs;
02158   bool Is64Bit = Subtarget->is64Bit();
02159   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
02160                  *DAG.getContext());
02161   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02162 
02163   // Copy all of the result registers out of their specified physreg.
02164   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02165     CCValAssign &VA = RVLocs[i];
02166     EVT CopyVT = VA.getValVT();
02167 
02168     // If this is x86-64, and we disabled SSE, we can't return FP values
02169     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02170         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02171       report_fatal_error("SSE register return with SSE disabled");
02172     }
02173 
02174     // If we prefer to use the value in xmm registers, copy it out as f80 and
02175     // use a truncate to move it from fp stack reg to xmm reg.
02176     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
02177         isScalarFPTypeInSSEReg(VA.getValVT()))
02178       CopyVT = MVT::f80;
02179 
02180     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02181                                CopyVT, InFlag).getValue(1);
02182     SDValue Val = Chain.getValue(0);
02183 
02184     if (CopyVT != VA.getValVT())
02185       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02186                         // This truncation won't change the value.
02187                         DAG.getIntPtrConstant(1));
02188 
02189     InFlag = Chain.getValue(2);
02190     InVals.push_back(Val);
02191   }
02192 
02193   return Chain;
02194 }
02195 
02196 //===----------------------------------------------------------------------===//
02197 //                C & StdCall & Fast Calling Convention implementation
02198 //===----------------------------------------------------------------------===//
02199 //  The StdCall calling convention is the standard for many Windows API
02200 //  routines. It differs from the C calling convention only slightly: the
02201 //  callee cleans up the stack rather than the caller, and symbols are
02202 //  decorated in a particular way. It doesn't support any vector arguments.
02203 //  For info on the fast calling convention, see the Fast Calling Convention
02204 //  (tail call) implementation in LowerX86_32FastCCCallTo.
02205 
02206 /// callIsStructReturn - Determines whether a call uses struct return
02207 /// semantics.
02208 enum StructReturnType {
02209   NotStructReturn,
02210   RegStructReturn,
02211   StackStructReturn
02212 };
02213 static StructReturnType
02214 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02215   if (Outs.empty())
02216     return NotStructReturn;
02217 
02218   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02219   if (!Flags.isSRet())
02220     return NotStructReturn;
02221   if (Flags.isInReg())
02222     return RegStructReturn;
02223   return StackStructReturn;
02224 }
02225 
02226 /// argsAreStructReturn - Determines whether a function uses struct
02227 /// return semantics.
02228 static StructReturnType
02229 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02230   if (Ins.empty())
02231     return NotStructReturn;
02232 
02233   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02234   if (!Flags.isSRet())
02235     return NotStructReturn;
02236   if (Flags.isInReg())
02237     return RegStructReturn;
02238   return StackStructReturn;
02239 }
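
// An IR-level sketch of what the two helpers above classify (%struct.S is an
// arbitrary aggregate used for illustration):
//
//   declare void @f(%struct.S* sret)        ; StackStructReturn
//   declare void @g(%struct.S* inreg sret)  ; RegStructReturn
//   declare i32  @h(i32)                    ; NotStructReturn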
02240 
02241 /// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
02242 /// specified by "Src" to the address "Dst", with size and alignment information
02243 /// specified by the specific parameter attribute. The copy will be passed as a
02244 /// byval function parameter.
02245 static SDValue
02246 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02247                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02248                           SDLoc dl) {
02249   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02250 
02251   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02252                        /*isVolatile*/false, /*AlwaysInline=*/true,
02253                        MachinePointerInfo(), MachinePointerInfo());
02254 }
02255 
02256 /// IsTailCallConvention - Return true if the calling convention is one that
02257 /// supports tail call optimization.
02258 static bool IsTailCallConvention(CallingConv::ID CC) {
02259   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02260           CC == CallingConv::HiPE);
02261 }
02262 
02263 /// \brief Return true if the calling convention is a C calling convention.
02264 static bool IsCCallConvention(CallingConv::ID CC) {
02265   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02266           CC == CallingConv::X86_64_SysV);
02267 }
02268 
02269 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02270   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02271     return false;
02272 
02273   CallSite CS(CI);
02274   CallingConv::ID CalleeCC = CS.getCallingConv();
02275   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02276     return false;
02277 
02278   return true;
02279 }
02280 
02281 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
02282 /// a tailcall target by changing its ABI.
02283 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02284                                    bool GuaranteedTailCallOpt) {
02285   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02286 }
02287 
02288 SDValue
02289 X86TargetLowering::LowerMemArgument(SDValue Chain,
02290                                     CallingConv::ID CallConv,
02291                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02292                                     SDLoc dl, SelectionDAG &DAG,
02293                                     const CCValAssign &VA,
02294                                     MachineFrameInfo *MFI,
02295                                     unsigned i) const {
02296   // Create the nodes corresponding to a load from this parameter slot.
02297   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02298   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
02299       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
02300   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02301   EVT ValVT;
02302 
02303   // If value is passed by pointer we have address passed instead of the value
02304   // itself.
02305   if (VA.getLocInfo() == CCValAssign::Indirect)
02306     ValVT = VA.getLocVT();
02307   else
02308     ValVT = VA.getValVT();
02309 
02310   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02311   // changed with more analysis.
02312   // In case of tail call optimization, mark all arguments mutable, since they
02313   // could be overwritten by the lowering of arguments in case of a tail call.
02314   if (Flags.isByVal()) {
02315     unsigned Bytes = Flags.getByValSize();
02316     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02317     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02318     return DAG.getFrameIndex(FI, getPointerTy());
02319   } else {
02320     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02321                                     VA.getLocMemOffset(), isImmutable);
02322     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02323     return DAG.getLoad(ValVT, dl, Chain, FIN,
02324                        MachinePointerInfo::getFixedStack(FI),
02325                        false, false, false, 0);
02326   }
02327 }
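// To illustrate the two paths above (hypothetical arguments): an i32 passed at
// stack offset 8 yields a 4-byte fixed stack object and a load from its frame
// index, while a byval aggregate yields only the frame index itself, so the
// callee works directly on the caller's copy of the aggregate.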
02328 
02329 // FIXME: Get this from tablegen.
02330 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
02331                                                 const X86Subtarget *Subtarget) {
02332   assert(Subtarget->is64Bit());
02333 
02334   if (Subtarget->isCallingConvWin64(CallConv)) {
02335     static const MCPhysReg GPR64ArgRegsWin64[] = {
02336       X86::RCX, X86::RDX, X86::R8,  X86::R9
02337     };
02338     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
02339   }
02340 
02341   static const MCPhysReg GPR64ArgRegs64Bit[] = {
02342     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02343   };
02344   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
02345 }
02346 
02347 // FIXME: Get this from tablegen.
02348 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
02349                                                 CallingConv::ID CallConv,
02350                                                 const X86Subtarget *Subtarget) {
02351   assert(Subtarget->is64Bit());
02352   if (Subtarget->isCallingConvWin64(CallConv)) {
02353     // The XMM registers which might contain var arg parameters are shadowed
02354     // in their paired GPR.  So we only need to save the GPR to their home
02355     // slots.
02356     // TODO: __vectorcall will change this.
02357     return None;
02358   }
02359 
02360   const Function *Fn = MF.getFunction();
02361   bool NoImplicitFloatOps = Fn->getAttributes().
02362       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
02363   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
02364          "SSE register cannot be used when SSE is disabled!");
02365   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02366       !Subtarget->hasSSE1())
02367     // SSE is disabled (e.g. soft-float, or kernel code built with
02368     // noimplicitfloat), so there are no XMM argument registers.
02369     return None;
02370 
02371   static const MCPhysReg XMMArgRegs64Bit[] = {
02372     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02373     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02374   };
02375   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
02376 }
02377 
02378 SDValue
02379 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02380                                         CallingConv::ID CallConv,
02381                                         bool isVarArg,
02382                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02383                                         SDLoc dl,
02384                                         SelectionDAG &DAG,
02385                                         SmallVectorImpl<SDValue> &InVals)
02386                                           const {
02387   MachineFunction &MF = DAG.getMachineFunction();
02388   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02389 
02390   const Function* Fn = MF.getFunction();
02391   if (Fn->hasExternalLinkage() &&
02392       Subtarget->isTargetCygMing() &&
02393       Fn->getName() == "main")
02394     FuncInfo->setForceFramePointer(true);
02395 
02396   MachineFrameInfo *MFI = MF.getFrameInfo();
02397   bool Is64Bit = Subtarget->is64Bit();
02398   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02399 
02400   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02401          "Var args not supported with calling convention fastcc, ghc or hipe");
02402 
02403   // Assign locations to all of the incoming arguments.
02404   SmallVector<CCValAssign, 16> ArgLocs;
02405   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02406 
02407   // Allocate shadow area for Win64
02408   if (IsWin64)
02409     CCInfo.AllocateStack(32, 8);
02410 
02411   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02412 
02413   unsigned LastVal = ~0U;
02414   SDValue ArgValue;
02415   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02416     CCValAssign &VA = ArgLocs[i];
02417     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02418     // places.
02419     assert(VA.getValNo() != LastVal &&
02420            "Don't support value assigned to multiple locs yet");
02421     (void)LastVal;
02422     LastVal = VA.getValNo();
02423 
02424     if (VA.isRegLoc()) {
02425       EVT RegVT = VA.getLocVT();
02426       const TargetRegisterClass *RC;
02427       if (RegVT == MVT::i32)
02428         RC = &X86::GR32RegClass;
02429       else if (Is64Bit && RegVT == MVT::i64)
02430         RC = &X86::GR64RegClass;
02431       else if (RegVT == MVT::f32)
02432         RC = &X86::FR32RegClass;
02433       else if (RegVT == MVT::f64)
02434         RC = &X86::FR64RegClass;
02435       else if (RegVT.is512BitVector())
02436         RC = &X86::VR512RegClass;
02437       else if (RegVT.is256BitVector())
02438         RC = &X86::VR256RegClass;
02439       else if (RegVT.is128BitVector())
02440         RC = &X86::VR128RegClass;
02441       else if (RegVT == MVT::x86mmx)
02442         RC = &X86::VR64RegClass;
02443       else if (RegVT == MVT::i1)
02444         RC = &X86::VK1RegClass;
02445       else if (RegVT == MVT::v8i1)
02446         RC = &X86::VK8RegClass;
02447       else if (RegVT == MVT::v16i1)
02448         RC = &X86::VK16RegClass;
02449       else if (RegVT == MVT::v32i1)
02450         RC = &X86::VK32RegClass;
02451       else if (RegVT == MVT::v64i1)
02452         RC = &X86::VK64RegClass;
02453       else
02454         llvm_unreachable("Unknown argument type!");
02455 
02456       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02457       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02458 
02459       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02460       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02461       // right size.
02462       if (VA.getLocInfo() == CCValAssign::SExt)
02463         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02464                                DAG.getValueType(VA.getValVT()));
02465       else if (VA.getLocInfo() == CCValAssign::ZExt)
02466         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02467                                DAG.getValueType(VA.getValVT()));
02468       else if (VA.getLocInfo() == CCValAssign::BCvt)
02469         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02470 
02471       if (VA.isExtInLoc()) {
02472         // Handle MMX values passed in XMM regs.
02473         if (RegVT.isVector())
02474           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02475         else
02476           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02477       }
02478     } else {
02479       assert(VA.isMemLoc());
02480       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02481     }
02482 
02483     // If value is passed via pointer - do a load.
02484     if (VA.getLocInfo() == CCValAssign::Indirect)
02485       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02486                              MachinePointerInfo(), false, false, false, 0);
02487 
02488     InVals.push_back(ArgValue);
02489   }
02490 
02491   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
02492     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02493       // The x86-64 ABIs require that for returning structs by value we copy
02494       // the sret argument into %rax/%eax (depending on ABI) for the return.
02495       // Win32 requires us to put the sret argument in %eax as well.
02496       // Save the argument into a virtual register so that we can access it
02497       // from the return points.
02498       if (Ins[i].Flags.isSRet()) {
02499         unsigned Reg = FuncInfo->getSRetReturnReg();
02500         if (!Reg) {
02501           MVT PtrTy = getPointerTy();
02502           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02503           FuncInfo->setSRetReturnReg(Reg);
02504         }
02505         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
02506         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02507         break;
02508       }
02509     }
02510   }
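  // E.g. (illustrative IR): for `define void @f(%struct.S* sret %out, i32 %x)`
  // the loop above copies the sret pointer into a fresh virtual register so
  // the return lowering can later place it in %rax / %eax as the ABI requires.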
02511 
02512   unsigned StackSize = CCInfo.getNextStackOffset();
02513   // Align stack specially for tail calls.
02514   if (FuncIsMadeTailCallSafe(CallConv,
02515                              MF.getTarget().Options.GuaranteedTailCallOpt))
02516     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02517 
02518   // If the function takes variable number of arguments, make a frame index for
02519   // the start of the first vararg value... for expansion of llvm.va_start. We
02520   // can skip this if there are no va_start calls.
02521   if (MFI->hasVAStart() &&
02522       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02523                    CallConv != CallingConv::X86_ThisCall))) {
02524     FuncInfo->setVarArgsFrameIndex(
02525         MFI->CreateFixedObject(1, StackSize, true));
02526   }
02527 
02528   // 64-bit calling conventions support varargs and register parameters, so we
02529   // have to do extra work to spill them in the prologue or forward them to
02530   // musttail calls.
02531   if (Is64Bit && isVarArg &&
02532       (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
02533     // Find the first unallocated argument registers.
02534     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
02535     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
02536     unsigned NumIntRegs =
02537         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
02538     unsigned NumXMMRegs =
02539         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
02540     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02541            "SSE register cannot be used when SSE is disabled!");
02542 
02543     // Gather all the live in physical registers.
02544     SmallVector<SDValue, 6> LiveGPRs;
02545     SmallVector<SDValue, 8> LiveXMMRegs;
02546     SDValue ALVal;
02547     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
02548       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
02549       LiveGPRs.push_back(
02550           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
02551     }
02552     if (!ArgXMMs.empty()) {
02553       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02554       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
02555       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
02556         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
02557         LiveXMMRegs.push_back(
02558             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
02559       }
02560     }
02561 
02562     // Store them to the va_list returned by va_start.
02563     if (MFI->hasVAStart()) {
02564       if (IsWin64) {
02565         const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
02566         // Get to the caller-allocated home save location.  Add 8 to account
02567         // for the return address.
02568         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02569         FuncInfo->setRegSaveFrameIndex(
02570           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02571         // Fixup to set vararg frame on shadow area (4 x i64).
02572         if (NumIntRegs < 4)
02573           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02574       } else {
02575         // For X86-64, if there are vararg parameters that are passed via
02576         // registers, then we must store them to their spots on the stack so
02577         // they may be loaded when the va_list is advanced by va_arg.
02578         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02579         FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
02580         FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
02581             ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
02582       }
02583 
02584       // Store the integer parameter registers.
02585       SmallVector<SDValue, 8> MemOps;
02586       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02587                                         getPointerTy());
02588       unsigned Offset = FuncInfo->getVarArgsGPOffset();
02589       for (SDValue Val : LiveGPRs) {
02590         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02591                                   DAG.getIntPtrConstant(Offset));
02592         SDValue Store =
02593           DAG.getStore(Val.getValue(1), dl, Val, FIN,
02594                        MachinePointerInfo::getFixedStack(
02595                          FuncInfo->getRegSaveFrameIndex(), Offset),
02596                        false, false, 0);
02597         MemOps.push_back(Store);
02598         Offset += 8;
02599       }
02600 
02601       if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
02602         // Now store the XMM (fp + vector) parameter registers.
02603         SmallVector<SDValue, 12> SaveXMMOps;
02604         SaveXMMOps.push_back(Chain);
02605         SaveXMMOps.push_back(ALVal);
02606         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02607                                FuncInfo->getRegSaveFrameIndex()));
02608         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02609                                FuncInfo->getVarArgsFPOffset()));
02610         SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
02611                           LiveXMMRegs.end());
02612         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02613                                      MVT::Other, SaveXMMOps));
02614       }
02615 
02616       if (!MemOps.empty())
02617         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
02618     } else {
02619       // Add all GPRs, al, and XMMs to the list of forwards.  We will add them
02620       // to the liveout set on a musttail call.
02621       assert(MFI->hasMustTailInVarArgFunc());
02622       auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
02623       typedef X86MachineFunctionInfo::Forward Forward;
02624 
02625       for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
02626         unsigned VReg =
02627             MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
02628         Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
02629         Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
02630       }
02631 
02632       if (!ArgXMMs.empty()) {
02633         unsigned ALVReg =
02634             MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
02635         Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
02636         Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
02637 
02638         for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
02639           unsigned VReg =
02640               MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
02641           Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
02642           Forwards.push_back(
02643               Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
02644         }
02645       }
02646     }
02647   }
02648 
02649   // Some CCs need callee pop.
02650   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02651                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02652     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02653   } else {
02654     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02655     // If this is an sret function, the return should pop the hidden pointer.
02656     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02657         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02658         argsAreStructReturn(Ins) == StackStructReturn)
02659       FuncInfo->setBytesToPopOnReturn(4);
02660   }
02661 
02662   if (!Is64Bit) {
02663     // RegSaveFrameIndex is X86-64 only.
02664     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02665     if (CallConv == CallingConv::X86_FastCall ||
02666         CallConv == CallingConv::X86_ThisCall)
02667       // fastcc functions can't have varargs.
02668       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02669   }
02670 
02671   FuncInfo->setArgumentStackSize(StackSize);
02672 
02673   return Chain;
02674 }
02675 
02676 SDValue
02677 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02678                                     SDValue StackPtr, SDValue Arg,
02679                                     SDLoc dl, SelectionDAG &DAG,
02680                                     const CCValAssign &VA,
02681                                     ISD::ArgFlagsTy Flags) const {
02682   unsigned LocMemOffset = VA.getLocMemOffset();
02683   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02684   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02685   if (Flags.isByVal())
02686     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02687 
02688   return DAG.getStore(Chain, dl, Arg, PtrOff,
02689                       MachinePointerInfo::getStack(LocMemOffset),
02690                       false, false, 0);
02691 }
02692 
02693 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
02694 /// optimization is performed and it is required.
02695 SDValue
02696 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02697                                            SDValue &OutRetAddr, SDValue Chain,
02698                                            bool IsTailCall, bool Is64Bit,
02699                                            int FPDiff, SDLoc dl) const {
02700   // Adjust the Return address stack slot.
02701   EVT VT = getPointerTy();
02702   OutRetAddr = getReturnAddressFrameIndex(DAG);
02703 
02704   // Load the "old" Return address.
02705   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02706                            false, false, false, 0);
02707   return SDValue(OutRetAddr.getNode(), 1);
02708 }
02709 
02710 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
02711 /// optimization is performed and it is required (FPDiff!=0).
02712 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
02713                                         SDValue Chain, SDValue RetAddrFrIdx,
02714                                         EVT PtrVT, unsigned SlotSize,
02715                                         int FPDiff, SDLoc dl) {
02716   // Store the return address to the appropriate stack slot.
02717   if (!FPDiff) return Chain;
02718   // Calculate the new stack slot for the return address.
02719   int NewReturnAddrFI =
02720     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02721                                          false);
02722   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02723   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02724                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02725                        false, false, 0);
02726   return Chain;
02727 }
02728 
02729 SDValue
02730 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02731                              SmallVectorImpl<SDValue> &InVals) const {
02732   SelectionDAG &DAG                     = CLI.DAG;
02733   SDLoc &dl                             = CLI.DL;
02734   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02735   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02736   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02737   SDValue Chain                         = CLI.Chain;
02738   SDValue Callee                        = CLI.Callee;
02739   CallingConv::ID CallConv              = CLI.CallConv;
02740   bool &isTailCall                      = CLI.IsTailCall;
02741   bool isVarArg                         = CLI.IsVarArg;
02742 
02743   MachineFunction &MF = DAG.getMachineFunction();
02744   bool Is64Bit        = Subtarget->is64Bit();
02745   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02746   StructReturnType SR = callIsStructReturn(Outs);
02747   bool IsSibcall      = false;
02748   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02749 
02750   if (MF.getTarget().Options.DisableTailCalls)
02751     isTailCall = false;
02752 
02753   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
02754   if (IsMustTail) {
02755     // Force this to be a tail call.  The verifier rules are enough to ensure
02756     // that we can lower this successfully without moving the return address
02757     // around.
02758     isTailCall = true;
02759   } else if (isTailCall) {
02760     // Check if it's really possible to do a tail call.
02761     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02762                     isVarArg, SR != NotStructReturn,
02763                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02764                     Outs, OutVals, Ins, DAG);
02765 
02766     // Sibcalls are automatically detected tailcalls which do not require
02767     // ABI changes.
02768     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02769       IsSibcall = true;
02770 
02771     if (isTailCall)
02772       ++NumTailCalls;
02773   }
02774 
02775   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02776          "Var args not supported with calling convention fastcc, ghc or hipe");
02777 
02778   // Analyze operands of the call, assigning locations to each operand.
02779   SmallVector<CCValAssign, 16> ArgLocs;
02780   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02781 
02782   // Allocate shadow area for Win64
02783   if (IsWin64)
02784     CCInfo.AllocateStack(32, 8);
02785 
02786   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02787 
02788   // Get a count of how many bytes are to be pushed on the stack.
02789   unsigned NumBytes = CCInfo.getNextStackOffset();
02790   if (IsSibcall)
02791     // This is a sibcall. The memory operands are already in place in the
02792     // caller's incoming argument area, which its own caller allocated.
02793     NumBytes = 0;
02794   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
02795            IsTailCallConvention(CallConv))
02796     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02797 
02798   int FPDiff = 0;
02799   if (isTailCall && !IsSibcall && !IsMustTail) {
02800     // Lower arguments at fp - stackoffset + fpdiff.
02801     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02802 
02803     FPDiff = NumBytesCallerPushed - NumBytes;
02804 
02805     // Set the delta of movement of the returnaddr stackslot.
02806     // But only set if delta is greater than previous delta.
02807     if (FPDiff < X86Info->getTCReturnAddrDelta())
02808       X86Info->setTCReturnAddrDelta(FPDiff);
02809   }
02810 
02811   unsigned NumBytesToPush = NumBytes;
02812   unsigned NumBytesToPop = NumBytes;
02813 
02814   // If we have an inalloca argument, all stack space has already been allocated
02815   // for us and will be right at the top of the stack.  We don't support multiple
02816   // arguments passed in memory when using inalloca.
02817   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02818     NumBytesToPush = 0;
02819     if (!ArgLocs.back().isMemLoc())
02820       report_fatal_error("cannot use inalloca attribute on a register "
02821                          "parameter");
02822     if (ArgLocs.back().getLocMemOffset() != 0)
02823       report_fatal_error("any parameter with the inalloca attribute must be "
02824                          "the only memory argument");
02825   }
02826 
02827   if (!IsSibcall)
02828     Chain = DAG.getCALLSEQ_START(
02829         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02830 
02831   SDValue RetAddrFrIdx;
02832   // Load return address for tail calls.
02833   if (isTailCall && FPDiff)
02834     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02835                                     Is64Bit, FPDiff, dl);
02836 
02837   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02838   SmallVector<SDValue, 8> MemOpChains;
02839   SDValue StackPtr;
02840 
02841   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02842   // of tail call optimization, arguments are handled later.
02843   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
02844       DAG.getSubtarget().getRegisterInfo());
02845   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02846     // Skip inalloca arguments, they have already been written.
02847     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02848     if (Flags.isInAlloca())
02849       continue;
02850 
02851     CCValAssign &VA = ArgLocs[i];
02852     EVT RegVT = VA.getLocVT();
02853     SDValue Arg = OutVals[i];
02854     bool isByVal = Flags.isByVal();
02855 
02856     // Promote the value if needed.
02857     switch (VA.getLocInfo()) {
02858     default: llvm_unreachable("Unknown loc info!");
02859     case CCValAssign::Full: break;
02860     case CCValAssign::SExt:
02861       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02862       break;
02863     case CCValAssign::ZExt:
02864       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02865       break;
02866     case CCValAssign::AExt:
02867       if (RegVT.is128BitVector()) {
02868         // Special case: passing MMX values in XMM registers.
02869         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02870         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02871         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02872       } else
02873         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02874       break;
02875     case CCValAssign::BCvt:
02876       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02877       break;
02878     case CCValAssign::Indirect: {
02879       // Store the argument.
02880       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02881       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02882       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02883                            MachinePointerInfo::getFixedStack(FI),
02884                            false, false, 0);
02885       Arg = SpillSlot;
02886       break;
02887     }
02888     }
02889 
02890     if (VA.isRegLoc()) {
02891       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02892       if (isVarArg && IsWin64) {
02893         // Win64 ABI requires argument XMM reg to be copied to the corresponding
02894         // shadow reg if callee is a varargs function.
02895         unsigned ShadowReg = 0;
02896         switch (VA.getLocReg()) {
02897         case X86::XMM0: ShadowReg = X86::RCX; break;
02898         case X86::XMM1: ShadowReg = X86::RDX; break;
02899         case X86::XMM2: ShadowReg = X86::R8; break;
02900         case X86::XMM3: ShadowReg = X86::R9; break;
02901         }
02902         if (ShadowReg)
02903           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02904       }
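      // Illustrative example (not from this file): a double passed as the
      // second argument to a Win64 varargs callee lands in XMM1, and the copy
      // above also places its bits in RDX, the paired shadow GPR, as the Win64
      // varargs ABI requires.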
02905     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02906       assert(VA.isMemLoc());
02907       if (!StackPtr.getNode())
02908         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02909                                       getPointerTy());
02910       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02911                                              dl, DAG, VA, Flags));
02912     }
02913   }
02914 
02915   if (!MemOpChains.empty())
02916     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
02917 
02918   if (Subtarget->isPICStyleGOT()) {
02919     // ELF / PIC requires GOT in the EBX register before function calls via PLT
02920     // GOT pointer.
02921     if (!isTailCall) {
02922       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02923                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02924     } else {
02925       // If we are tail calling and generating PIC/GOT style code load the
02926       // address of the callee into ECX. The value in ecx is used as target of
02927       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02928       // for tail calls on PIC/GOT architectures. Normally we would just put the
02929       // address of GOT into ebx and then call target@PLT. But for tail calls
02930       // ebx would be restored (since ebx is callee saved) before jumping to the
02931       // target@PLT.
02932 
02933       // Note: The actual moving to ECX is done further down.
02934       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02935       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02936           !G->getGlobal()->hasProtectedVisibility())
02937         Callee = LowerGlobalAddress(Callee, DAG);
02938       else if (isa<ExternalSymbolSDNode>(Callee))
02939         Callee = LowerExternalSymbol(Callee, DAG);
02940     }
02941   }
02942 
02943   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
02944     // From AMD64 ABI document:
02945     // For calls that may call functions that use varargs or stdargs
02946     // (prototype-less calls or calls to functions containing ellipsis (...) in
02947     // the declaration) %al is used as hidden argument to specify the number
02948     // of SSE registers used. The contents of %al do not need to match exactly
02949     // the number of registers, but must be an upper bound on the number of SSE
02950     // registers used, and must be in the range 0 - 8 inclusive.
02951 
02952     // Count the number of XMM registers allocated.
02953     static const MCPhysReg XMMArgRegs[] = {
02954       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02955       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02956     };
02957     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
02958     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02959            && "SSE registers cannot be used when SSE is disabled");
02960 
02961     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02962                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
02963   }
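  // As a concrete illustration (generic example): lowering a call like
  // printf("%f", x) here passes the double in XMM0, so NumXMMRegs is 1 and the
  // sequence above pins AL to the constant 1 before the call; a variadic call
  // with no floating-point arguments gets AL = 0.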
02964 
02965   if (Is64Bit && isVarArg && IsMustTail) {
02966     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
02967     for (const auto &F : Forwards) {
02968       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02969       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
02970     }
02971   }
02972 
02973   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
02974   // don't need this because the eligibility check rejects calls that require
02975   // shuffling arguments passed in memory.
02976   if (!IsSibcall && isTailCall) {
02977     // Force all the incoming stack arguments to be loaded from the stack
02978     // before any new outgoing arguments are stored to the stack, because the
02979     // outgoing stack slots may alias the incoming argument stack slots, and
02980     // the alias isn't otherwise explicit. This is slightly more conservative
02981     // than necessary, because it means that each store effectively depends
02982     // on every argument instead of just those arguments it would clobber.
02983     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02984 
02985     SmallVector<SDValue, 8> MemOpChains2;
02986     SDValue FIN;
02987     int FI = 0;
02988     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02989       CCValAssign &VA = ArgLocs[i];
02990       if (VA.isRegLoc())
02991         continue;
02992       assert(VA.isMemLoc());
02993       SDValue Arg = OutVals[i];
02994       ISD::ArgFlagsTy Flags = Outs[i].Flags;
02995       // Skip inalloca arguments.  They don't require any work.
02996       if (Flags.isInAlloca())
02997         continue;
02998       // Create frame index.
02999       int32_t Offset = VA.getLocMemOffset()+FPDiff;
03000       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
03001       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
03002       FIN = DAG.getFrameIndex(FI, getPointerTy());
03003 
03004       if (Flags.isByVal()) {
03005         // Copy relative to framepointer.
03006         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
03007         if (!StackPtr.getNode())
03008           StackPtr = DAG.getCopyFromReg(Chain, dl,
03009                                         RegInfo->getStackRegister(),
03010                                         getPointerTy());
03011         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
03012 
03013         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
03014                                                          ArgChain,
03015                                                          Flags, DAG, dl));
03016       } else {
03017         // Store relative to framepointer.
03018         MemOpChains2.push_back(
03019           DAG.getStore(ArgChain, dl, Arg, FIN,
03020                        MachinePointerInfo::getFixedStack(FI),
03021                        false, false, 0));
03022       }
03023     }
03024 
03025     if (!MemOpChains2.empty())
03026       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
03027 
03028     // Store the return address to the appropriate stack slot.
03029     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
03030                                      getPointerTy(), RegInfo->getSlotSize(),
03031                                      FPDiff, dl);
03032   }
03033 
03034   // Build a sequence of copy-to-reg nodes chained together with token chain
03035   // and flag operands which copy the outgoing args into registers.
03036   SDValue InFlag;
03037   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
03038     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
03039                              RegsToPass[i].second, InFlag);
03040     InFlag = Chain.getValue(1);
03041   }
03042 
03043   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
03044     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
03045     // In the 64-bit large code model, we have to make all calls
03046     // through a register, since the call instruction's 32-bit
03047     // pc-relative offset may not be large enough to hold the whole
03048     // address.
03049   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
03050     // If the callee is a GlobalAddress node (quite common, every direct call
03051     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
03052     // it.
03053 
03054     // We should use an extra load for direct calls to dllimported functions in
03055     // non-JIT mode.
03056     const GlobalValue *GV = G->getGlobal();
03057     if (!GV->hasDLLImportStorageClass()) {
03058       unsigned char OpFlags = 0;
03059       bool ExtraLoad = false;
03060       unsigned WrapperKind = ISD::DELETED_NODE;
03061 
03062       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
03063       // external symbols must go through the PLT in PIC mode.  If the symbol
03064       // has hidden or protected visibility, or if it is static or local, then
03065       // we don't need to use the PLT - we can directly call it.
03066       if (Subtarget->isTargetELF() &&
03067           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
03068           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
03069         OpFlags = X86II::MO_PLT;
03070       } else if (Subtarget->isPICStyleStubAny() &&
03071                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
03072                  (!Subtarget->getTargetTriple().isMacOSX() ||
03073                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03074         // PC-relative references to external symbols should go through $stub,
03075         // unless we're building with the leopard linker or later, which
03076         // automatically synthesizes these stubs.
03077         OpFlags = X86II::MO_DARWIN_STUB;
03078       } else if (Subtarget->isPICStyleRIPRel() &&
03079                  isa<Function>(GV) &&
03080                  cast<Function>(GV)->getAttributes().
03081                    hasAttribute(AttributeSet::FunctionIndex,
03082                                 Attribute::NonLazyBind)) {
03083         // If the function is marked as non-lazy, generate an indirect call
03084         // which loads from the GOT directly. This avoids runtime overhead
03085         // at the cost of eager binding (and one extra byte of encoding).
03086         OpFlags = X86II::MO_GOTPCREL;
03087         WrapperKind = X86ISD::WrapperRIP;
03088         ExtraLoad = true;
03089       }
03090 
03091       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
03092                                           G->getOffset(), OpFlags);
03093 
03094       // Add a wrapper if needed.
03095       if (WrapperKind != ISD::DELETED_NODE)
03096         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
03097       // Add extra indirection if needed.
03098       if (ExtraLoad)
03099         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
03100                              MachinePointerInfo::getGOT(),
03101                              false, false, false, 0);
03102     }
03103   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
03104     unsigned char OpFlags = 0;
03105 
03106     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
03107     // external symbols should go through the PLT.
03108     if (Subtarget->isTargetELF() &&
03109         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
03110       OpFlags = X86II::MO_PLT;
03111     } else if (Subtarget->isPICStyleStubAny() &&
03112                (!Subtarget->getTargetTriple().isMacOSX() ||
03113                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03114       // PC-relative references to external symbols should go through $stub,
03115       // unless we're building with the leopard linker or later, which
03116       // automatically synthesizes these stubs.
03117       OpFlags = X86II::MO_DARWIN_STUB;
03118     }
03119 
03120     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
03121                                          OpFlags);
03122   }
03123 
03124   // Returns a chain & a flag for retval copy to use.
03125   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
03126   SmallVector<SDValue, 8> Ops;
03127 
03128   if (!IsSibcall && isTailCall) {
03129     Chain = DAG.getCALLSEQ_END(Chain,
03130                                DAG.getIntPtrConstant(NumBytesToPop, true),
03131                                DAG.getIntPtrConstant(0, true), InFlag, dl);
03132     InFlag = Chain.getValue(1);
03133   }
03134 
03135   Ops.push_back(Chain);
03136   Ops.push_back(Callee);
03137 
03138   if (isTailCall)
03139     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
03140 
03141   // Add argument registers to the end of the list so that they are known live
03142   // into the call.
03143   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
03144     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
03145                                   RegsToPass[i].second.getValueType()));
03146 
03147   // Add a register mask operand representing the call-preserved registers.
03148   const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
03149   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
03150   assert(Mask && "Missing call preserved mask for calling convention");
03151   Ops.push_back(DAG.getRegisterMask(Mask));
03152 
03153   if (InFlag.getNode())
03154     Ops.push_back(InFlag);
03155 
03156   if (isTailCall) {
03157     // We used to do:
03158     //// If this is the first return lowered for this function, add the regs
03159     //// to the liveout set for the function.
03160     // This isn't right, although it's probably harmless on x86; liveouts
03161     // should be computed from returns not tail calls.  Consider a void
03162     // function making a tail call to a function returning int.
03163     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03164   }
03165 
03166   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03167   InFlag = Chain.getValue(1);
03168 
03169   // Create the CALLSEQ_END node.
03170   unsigned NumBytesForCalleeToPop;
03171   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03172                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03173     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03174   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03175            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03176            SR == StackStructReturn)
03177     // If this is a call to a struct-return function, the callee
03178     // pops the hidden struct pointer, so we have to push it back.
03179     // This is common for Darwin/X86, Linux & Mingw32 targets.
03180     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03181     NumBytesForCalleeToPop = 4;
03182   else
03183     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03184 
03185   // Returns a flag for retval copy to use.
03186   if (!IsSibcall) {
03187     Chain = DAG.getCALLSEQ_END(Chain,
03188                                DAG.getIntPtrConstant(NumBytesToPop, true),
03189                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03190                                                      true),
03191                                InFlag, dl);
03192     InFlag = Chain.getValue(1);
03193   }
03194 
03195   // Handle result values, copying them out of physregs into vregs that we
03196   // return.
03197   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03198                          Ins, dl, DAG, InVals);
03199 }
03200 
03201 //===----------------------------------------------------------------------===//
03202 //                Fast Calling Convention (tail call) implementation
03203 //===----------------------------------------------------------------------===//
03204 
03205 //  Like StdCall, the callee cleans up the arguments, except that ECX is
03206 //  reserved for storing the address of the tail-called function, so only 2
03207 //  registers are free for argument passing (inreg). Tail call optimization is
03208 //  performed provided:
03209 //                * tailcallopt is enabled
03210 //                * caller/callee are fastcc
03211 //  On X86_64 architecture with GOT-style position independent code only local
03212 //  (within module) calls are supported at the moment.
03213 //  To keep the stack aligned according to the platform ABI, the function
03214 //  GetAlignedArgumentStackSize ensures the argument delta is always a multiple
03215 //  of the stack alignment. (Dynamic linkers such as Darwin's dyld need this.)
03216 //  If a tail-called callee has more arguments than the caller, the caller
03217 //  needs to make sure that there is room to move the RETADDR to. This is
03218 //  achieved by reserving an area the size of the argument delta right after
03219 //  the original RETADDR, but before the saved frame pointer or the spilled
03220 //  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
03221 //  stack layout:
03222 //    arg1
03223 //    arg2
03224 //    RETADDR
03225 //    [ new RETADDR
03226 //      move area ]
03227 //    (possible EBP)
03228 //    ESI
03229 //    EDI
03230 //    local1 ..
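//  For instance (illustrative numbers): if the caller's incoming arguments
//  occupy 8 bytes but the callee's need 16, FPDiff is -8, so an 8 byte move
//  area is reserved and the return address is stored 8 bytes lower before the
//  jump.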
03231 
03232 /// GetAlignedArgumentStackSize - Round the stack size up so that it is of the
03233 /// form 16n + 12, matching a 16 byte stack alignment requirement.
03234 unsigned
03235 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03236                                                SelectionDAG& DAG) const {
03237   MachineFunction &MF = DAG.getMachineFunction();
03238   const TargetMachine &TM = MF.getTarget();
03239   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03240       TM.getSubtargetImpl()->getRegisterInfo());
03241   const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
03242   unsigned StackAlignment = TFI.getStackAlignment();
03243   uint64_t AlignMask = StackAlignment - 1;
03244   int64_t Offset = StackSize;
03245   unsigned SlotSize = RegInfo->getSlotSize();
03246   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03247     // The low bits are at most StackAlignment - SlotSize; just add the difference.
03248     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03249   } else {
03250     // Mask out the low bits, then add StackAlignment plus (StackAlignment - SlotSize).
03251     Offset = ((~AlignMask) & Offset) + StackAlignment +
03252       (StackAlignment-SlotSize);
03253   }
03254   return Offset;
03255 }
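// Worked example (assuming a 16 byte stack alignment and a 4 byte slot, i.e.
// 32-bit mode): StackSize = 20 has low bits 4 <= 12, so the result is
// 20 + (12 - 4) = 28 = 16*1 + 12; StackSize = 30 has low bits 14 > 12, so the
// result is (30 & ~15) + 16 + 12 = 44, again of the form 16n + 12.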
03256 
03257 /// MatchingStackOffset - Return true if the given stack call argument is
03258 /// already available at the same (relative) position in the caller's incoming
03259 /// argument stack.
03260 static
03261 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03262                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03263                          const X86InstrInfo *TII) {
03264   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03265   int FI = INT_MAX;
03266   if (Arg.getOpcode() == ISD::CopyFromReg) {
03267     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03268     if (!TargetRegisterInfo::isVirtualRegister(VR))
03269       return false;
03270     MachineInstr *Def = MRI->getVRegDef(VR);
03271     if (!Def)
03272       return false;
03273     if (!Flags.isByVal()) {
03274       if (!TII->isLoadFromStackSlot(Def, FI))
03275         return false;
03276     } else {
03277       unsigned Opcode = Def->getOpcode();
03278       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
03279           Def->getOperand(1).isFI()) {
03280         FI = Def->getOperand(1).getIndex();
03281         Bytes = Flags.getByValSize();
03282       } else
03283         return false;
03284     }
03285   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03286     if (Flags.isByVal())
03287       // ByVal argument is passed in as a pointer but it's now being
03288       // dereferenced. e.g.
03289       // define @foo(%struct.X* %A) {
03290       //   tail call @bar(%struct.X* byval %A)
03291       // }
03292       return false;
03293     SDValue Ptr = Ld->getBasePtr();
03294     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03295     if (!FINode)
03296       return false;
03297     FI = FINode->getIndex();
03298   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03299     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03300     FI = FINode->getIndex();
03301     Bytes = Flags.getByValSize();
03302   } else
03303     return false;
03304 
03305   assert(FI != INT_MAX);
03306   if (!MFI->isFixedObjectIndex(FI))
03307     return false;
03308   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03309 }
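// Informally: an outgoing stack argument "matches" when it is just the
// caller's own incoming argument reloaded from (or, for byval, addressed at) a
// fixed stack object whose offset and size equal the outgoing slot's, so a
// sibcall can leave it exactly where it already is.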
03310 
03311 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03312 /// for tail call optimization. Targets which want to do tail call
03313 /// optimization should implement this function.
03314 bool
03315 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03316                                                      CallingConv::ID CalleeCC,
03317                                                      bool isVarArg,
03318                                                      bool isCalleeStructRet,
03319                                                      bool isCallerStructRet,
03320                                                      Type *RetTy,
03321                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03322                                     const SmallVectorImpl<SDValue> &OutVals,
03323                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03324                                                      SelectionDAG &DAG) const {
03325   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03326     return false;
03327 
03328   // If -tailcallopt is specified, make fastcc functions tail-callable.
03329   const MachineFunction &MF = DAG.getMachineFunction();
03330   const Function *CallerF = MF.getFunction();
03331 
03332   // If the function return type is x86_fp80 and the callee return type is not,
03333   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03334   // perform a tailcall optimization here.
03335   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03336     return false;
03337 
03338   CallingConv::ID CallerCC = CallerF->getCallingConv();
03339   bool CCMatch = CallerCC == CalleeCC;
03340   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03341   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03342 
03343   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
03344     if (IsTailCallConvention(CalleeCC) && CCMatch)
03345       return true;
03346     return false;
03347   }
03348 
03349   // Look for obvious safe cases to perform tail call optimization that do not
03350   // require ABI changes. This is what gcc calls sibcall.
03351 
03352   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03353   // emit a special epilogue.
03354   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03355       DAG.getSubtarget().getRegisterInfo());
03356   if (RegInfo->needsStackRealignment(MF))
03357     return false;
03358 
03359   // Also avoid sibcall optimization if either caller or callee uses struct
03360   // return semantics.
03361   if (isCalleeStructRet || isCallerStructRet)
03362     return false;
03363 
03364   // An stdcall/thiscall caller is expected to clean up its arguments; the
03365   // callee isn't going to do that.
03366   // FIXME: this is more restrictive than needed. We could produce a tailcall
03367   // when the stack adjustment matches. For example, with a thiscall that takes
03368   // only one argument.
03369   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03370                    CallerCC == CallingConv::X86_ThisCall))
03371     return false;
03372 
03373   // Do not sibcall optimize vararg calls unless all arguments are passed via
03374   // registers.
03375   if (isVarArg && !Outs.empty()) {
03376 
03377     // Optimizing for varargs on Win64 is unlikely to be safe without
03378     // additional testing.
03379     if (IsCalleeWin64 || IsCallerWin64)
03380       return false;
03381 
03382     SmallVector<CCValAssign, 16> ArgLocs;
03383     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03384                    *DAG.getContext());
03385 
03386     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03387     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03388       if (!ArgLocs[i].isRegLoc())
03389         return false;
03390   }
03391 
03392   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03393   // stack.  Therefore, if it's not used by the caller it is not safe to
03394   // optimize this into a sibcall.
03395   bool Unused = false;
03396   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03397     if (!Ins[i].Used) {
03398       Unused = true;
03399       break;
03400     }
03401   }
03402   if (Unused) {
03403     SmallVector<CCValAssign, 16> RVLocs;
03404     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
03405                    *DAG.getContext());
03406     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03407     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03408       CCValAssign &VA = RVLocs[i];
03409       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
03410         return false;
03411     }
03412   }
03413 
03414   // If the calling conventions do not match, then we'd better make sure the
03415   // results are returned in the same way as what the caller expects.
03416   if (!CCMatch) {
03417     SmallVector<CCValAssign, 16> RVLocs1;
03418     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
03419                     *DAG.getContext());
03420     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03421 
03422     SmallVector<CCValAssign, 16> RVLocs2;
03423     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
03424                     *DAG.getContext());
03425     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03426 
03427     if (RVLocs1.size() != RVLocs2.size())
03428       return false;
03429     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03430       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03431         return false;
03432       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03433         return false;
03434       if (RVLocs1[i].isRegLoc()) {
03435         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03436           return false;
03437       } else {
03438         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03439           return false;
03440       }
03441     }
03442   }
03443 
03444   // If the callee takes no arguments then go on to check the results of the
03445   // call.
03446   if (!Outs.empty()) {
03447     // Check if stack adjustment is needed. For now, do not do this if any
03448     // argument is passed on the stack.
03449     SmallVector<CCValAssign, 16> ArgLocs;
03450     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03451                    *DAG.getContext());
03452 
03453     // Allocate shadow area for Win64
03454     if (IsCalleeWin64)
03455       CCInfo.AllocateStack(32, 8);
03456 
03457     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03458     if (CCInfo.getNextStackOffset()) {
03459       MachineFunction &MF = DAG.getMachineFunction();
03460       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03461         return false;
03462 
03463       // Check whether the arguments are already laid out in the same way as
03464       // the caller's fixed stack objects.
03465       MachineFrameInfo *MFI = MF.getFrameInfo();
03466       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03467       const X86InstrInfo *TII =
03468           static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
03469       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03470         CCValAssign &VA = ArgLocs[i];
03471         SDValue Arg = OutVals[i];
03472         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03473         if (VA.getLocInfo() == CCValAssign::Indirect)
03474           return false;
03475         if (!VA.isRegLoc()) {
03476           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03477                                    MFI, MRI, TII))
03478             return false;
03479         }
03480       }
03481     }
03482 
03483     // If the tailcall address may be in a register, then make sure it's
03484     // possible to register allocate for it. In 32-bit, the call address can
03485     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03486     // callee-saved registers are restored. These happen to be the same
03487     // registers used to pass 'inreg' arguments so watch out for those.
03488     if (!Subtarget->is64Bit() &&
03489         ((!isa<GlobalAddressSDNode>(Callee) &&
03490           !isa<ExternalSymbolSDNode>(Callee)) ||
03491          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
03492       unsigned NumInRegs = 0;
03493       // In PIC we need an extra register to formulate the address computation
03494       // for the callee.
03495       unsigned MaxInRegs =
03496           (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03497 
03498       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03499         CCValAssign &VA = ArgLocs[i];
03500         if (!VA.isRegLoc())
03501           continue;
03502         unsigned Reg = VA.getLocReg();
03503         switch (Reg) {
03504         default: break;
03505         case X86::EAX: case X86::EDX: case X86::ECX:
03506           if (++NumInRegs == MaxInRegs)
03507             return false;
03508           break;
03509         }
03510       }
03511     }
03512   }
03513 
03514   return true;
03515 }
03516 
03517 FastISel *
03518 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03519                                   const TargetLibraryInfo *libInfo) const {
03520   return X86::createFastISel(funcInfo, libInfo);
03521 }
03522 
03523 //===----------------------------------------------------------------------===//
03524 //                           Other Lowering Hooks
03525 //===----------------------------------------------------------------------===//
03526 
03527 static bool MayFoldLoad(SDValue Op) {
03528   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03529 }
03530 
03531 static bool MayFoldIntoStore(SDValue Op) {
03532   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03533 }
03534 
03535 static bool isTargetShuffle(unsigned Opcode) {
03536   switch(Opcode) {
03537   default: return false;
03538   case X86ISD::PSHUFB:
03539   case X86ISD::PSHUFD:
03540   case X86ISD::PSHUFHW:
03541   case X86ISD::PSHUFLW:
03542   case X86ISD::SHUFP:
03543   case X86ISD::PALIGNR:
03544   case X86ISD::MOVLHPS:
03545   case X86ISD::MOVLHPD:
03546   case X86ISD::MOVHLPS:
03547   case X86ISD::MOVLPS:
03548   case X86ISD::MOVLPD:
03549   case X86ISD::MOVSHDUP:
03550   case X86ISD::MOVSLDUP:
03551   case X86ISD::MOVDDUP:
03552   case X86ISD::MOVSS:
03553   case X86ISD::MOVSD:
03554   case X86ISD::UNPCKL:
03555   case X86ISD::UNPCKH:
03556   case X86ISD::VPERMILP:
03557   case X86ISD::VPERM2X128:
03558   case X86ISD::VPERMI:
03559     return true;
03560   }
03561 }
03562 
03563 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03564                                     SDValue V1, SelectionDAG &DAG) {
03565   switch(Opc) {
03566   default: llvm_unreachable("Unknown x86 shuffle node");
03567   case X86ISD::MOVSHDUP:
03568   case X86ISD::MOVSLDUP:
03569   case X86ISD::MOVDDUP:
03570     return DAG.getNode(Opc, dl, VT, V1);
03571   }
03572 }
03573 
03574 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03575                                     SDValue V1, unsigned TargetMask,
03576                                     SelectionDAG &DAG) {
03577   switch(Opc) {
03578   default: llvm_unreachable("Unknown x86 shuffle node");
03579   case X86ISD::PSHUFD:
03580   case X86ISD::PSHUFHW:
03581   case X86ISD::PSHUFLW:
03582   case X86ISD::VPERMILP:
03583   case X86ISD::VPERMI:
03584     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03585   }
03586 }
03587 
03588 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03589                                     SDValue V1, SDValue V2, unsigned TargetMask,
03590                                     SelectionDAG &DAG) {
03591   switch(Opc) {
03592   default: llvm_unreachable("Unknown x86 shuffle node");
03593   case X86ISD::PALIGNR:
03594   case X86ISD::VALIGN:
03595   case X86ISD::SHUFP:
03596   case X86ISD::VPERM2X128:
03597     return DAG.getNode(Opc, dl, VT, V1, V2,
03598                        DAG.getConstant(TargetMask, MVT::i8));
03599   }
03600 }
03601 
03602 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03603                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03604   switch(Opc) {
03605   default: llvm_unreachable("Unknown x86 shuffle node");
03606   case X86ISD::MOVLHPS:
03607   case X86ISD::MOVLHPD:
03608   case X86ISD::MOVHLPS:
03609   case X86ISD::MOVLPS:
03610   case X86ISD::MOVLPD:
03611   case X86ISD::MOVSS:
03612   case X86ISD::MOVSD:
03613   case X86ISD::UNPCKL:
03614   case X86ISD::UNPCKH:
03615     return DAG.getNode(Opc, dl, VT, V1, V2);
03616   }
03617 }
03618 
03619 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03620   MachineFunction &MF = DAG.getMachineFunction();
03621   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03622       DAG.getSubtarget().getRegisterInfo());
03623   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03624   int ReturnAddrIndex = FuncInfo->getRAIndex();
03625 
03626   if (ReturnAddrIndex == 0) {
03627     // Set up a frame object for the return address.
03628     unsigned SlotSize = RegInfo->getSlotSize();
03629     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03630                                                            -(int64_t)SlotSize,
03631                                                            false);
03632     FuncInfo->setRAIndex(ReturnAddrIndex);
03633   }
03634 
03635   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03636 }
03637 
03638 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03639                                        bool hasSymbolicDisplacement) {
03640   // Offset should fit into 32 bit immediate field.
03641   if (!isInt<32>(Offset))
03642     return false;
03643 
03644   // If we don't have a symbolic displacement - we don't have any extra
03645   // restrictions.
03646   if (!hasSymbolicDisplacement)
03647     return true;
03648 
03649   // FIXME: Some tweaks might be needed for medium code model.
03650   if (M != CodeModel::Small && M != CodeModel::Kernel)
03651     return false;
03652 
03653   // For the small code model we assume the last object lies within 16MB of the
03654   // end of the 31-bit boundary. We may also accept fairly large negative
03655   // constants, knowing that all objects sit in the positive half of the address space.
03656   if (M == CodeModel::Small && Offset < 16*1024*1024)
03657     return true;
03658 
03659   // For the kernel code model we know that all objects reside in the negative
03660   // half of the 32-bit address space, so we must not accept negative offsets.
03661   // Fairly large positive offsets, on the other hand, are fine.
03662   if (M == CodeModel::Kernel && Offset > 0)
03663     return true;
03664 
03665   return false;
03666 }
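// A minimal illustration of the offset rules above; the helper name and the
// concrete offsets are hypothetical, kept under '#if 0' so nothing here is
// compiled into the file.
#if 0
static void exampleCodeModelOffsets() {
  // With a symbolic displacement, the small code model accepts offsets that
  // stay inside the 16MB guard band.
  assert(X86::isOffsetSuitableForCodeModel(4096, CodeModel::Small,
                                           /*hasSymbolicDisplacement=*/true));
  // The kernel code model lives in the negative half, so a negative offset
  // next to a symbol is rejected.
  assert(!X86::isOffsetSuitableForCodeModel(-8, CodeModel::Kernel,
                                            /*hasSymbolicDisplacement=*/true));
  // Without a symbolic displacement, any offset that fits in 32 bits is fine.
  assert(X86::isOffsetSuitableForCodeModel(-8, CodeModel::Kernel,
                                           /*hasSymbolicDisplacement=*/false));
}
#endif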
03667 
03668 /// isCalleePop - Determines whether the callee is required to pop its
03669 /// own arguments. Callee pop is necessary to support tail calls.
03670 bool X86::isCalleePop(CallingConv::ID CallingConv,
03671                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03672   switch (CallingConv) {
03673   default:
03674     return false;
03675   case CallingConv::X86_StdCall:
03676   case CallingConv::X86_FastCall:
03677   case CallingConv::X86_ThisCall:
03678     return !is64Bit;
03679   case CallingConv::Fast:
03680   case CallingConv::GHC:
03681   case CallingConv::HiPE:
03682     if (IsVarArg)
03683       return false;
03684     return TailCallOpt;
03685   }
03686 }
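// A minimal illustration of the callee-pop rules above; the helper name is
// hypothetical and the block is kept under '#if 0' so it is not compiled.
#if 0
static void exampleCalleePop() {
  // stdcall callees pop their own arguments on 32-bit targets.
  assert(X86::isCalleePop(CallingConv::X86_StdCall, /*is64Bit=*/false,
                          /*IsVarArg=*/false, /*TailCallOpt=*/false));
  // fastcc only pops when -tailcallopt is in effect and the call is not
  // variadic.
  assert(!X86::isCalleePop(CallingConv::Fast, /*is64Bit=*/false,
                           /*IsVarArg=*/false, /*TailCallOpt=*/false));
  assert(X86::isCalleePop(CallingConv::Fast, /*is64Bit=*/false,
                          /*IsVarArg=*/false, /*TailCallOpt=*/true));
}
#endif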
03687 
03688 /// \brief Return true if the condition is an unsigned comparison operation.
03689 static bool isX86CCUnsigned(unsigned X86CC) {
03690   switch (X86CC) {
03691   default: llvm_unreachable("Invalid integer condition!");
03692   case X86::COND_E:     return true;
03693   case X86::COND_G:     return false;
03694   case X86::COND_GE:    return false;
03695   case X86::COND_L:     return false;
03696   case X86::COND_LE:    return false;
03697   case X86::COND_NE:    return true;
03698   case X86::COND_B:     return true;
03699   case X86::COND_A:     return true;
03700   case X86::COND_BE:    return true;
03701   case X86::COND_AE:    return true;
03702   }
03703   llvm_unreachable("covered switch fell through?!");
03704 }
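// A minimal illustration of the signed/unsigned split above; the helper name
// is hypothetical and the block is kept under '#if 0' so it is not compiled.
#if 0
static void exampleUnsignedCC() {
  assert(isX86CCUnsigned(X86::COND_B));   // "below" comes from unsigned SETULT
  assert(!isX86CCUnsigned(X86::COND_L));  // "less" comes from signed SETLT
}
#endif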
03705 
03706 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
03707 /// X86-specific condition code, returning the condition code and the LHS/RHS
03708 /// of the comparison to make.
03709 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03710                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03711   if (!isFP) {
03712     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03713       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03714         // X > -1   -> X == 0, jump !sign.
03715         RHS = DAG.getConstant(0, RHS.getValueType());
03716         return X86::COND_NS;
03717       }
03718       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03719         // X < 0   -> X == 0, jump on sign.
03720         return X86::COND_S;
03721       }
03722       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03723         // X < 1   -> X <= 0
03724         RHS = DAG.getConstant(0, RHS.getValueType());
03725         return X86::COND_LE;
03726       }
03727     }
03728 
03729     switch (SetCCOpcode) {
03730     default: llvm_unreachable("Invalid integer condition!");
03731     case ISD::SETEQ:  return X86::COND_E;
03732     case ISD::SETGT:  return X86::COND_G;
03733     case ISD::SETGE:  return X86::COND_GE;
03734     case ISD::SETLT:  return X86::COND_L;
03735     case ISD::SETLE:  return X86::COND_LE;
03736     case ISD::SETNE:  return X86::COND_NE;
03737     case ISD::SETULT: return X86::COND_B;
03738     case ISD::SETUGT: return X86::COND_A;
03739     case ISD::SETULE: return X86::COND_BE;
03740     case ISD::SETUGE: return X86::COND_AE;
03741     }
03742   }
03743 
03744   // First determine if it is required or is profitable to flip the operands.
03745 
03746   // If LHS is a foldable load, but RHS is not, flip the condition.
03747   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03748       !ISD::isNON_EXTLoad(RHS.getNode())) {
03749     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03750     std::swap(LHS, RHS);
03751   }
03752 
03753   switch (SetCCOpcode) {
03754   default: break;
03755   case ISD::SETOLT:
03756   case ISD::SETOLE:
03757   case ISD::SETUGT:
03758   case ISD::SETUGE:
03759     std::swap(LHS, RHS);
03760     break;
03761   }
03762 
03763   // On a floating point condition, the flags are set as follows:
03764   // ZF  PF  CF   op
03765   //  0 | 0 | 0 | X > Y
03766   //  0 | 0 | 1 | X < Y
03767   //  1 | 0 | 0 | X == Y
03768   //  1 | 1 | 1 | unordered
03769   switch (SetCCOpcode) {
03770   default: llvm_unreachable("Condcode should be pre-legalized away");
03771   case ISD::SETUEQ:
03772   case ISD::SETEQ:   return X86::COND_E;
03773   case ISD::SETOLT:              // flipped
03774   case ISD::SETOGT:
03775   case ISD::SETGT:   return X86::COND_A;
03776   case ISD::SETOLE:              // flipped
03777   case ISD::SETOGE:
03778   case ISD::SETGE:   return X86::COND_AE;
03779   case ISD::SETUGT:              // flipped
03780   case ISD::SETULT:
03781   case ISD::SETLT:   return X86::COND_B;
03782   case ISD::SETUGE:              // flipped
03783   case ISD::SETULE:
03784   case ISD::SETLE:   return X86::COND_BE;
03785   case ISD::SETONE:
03786   case ISD::SETNE:   return X86::COND_NE;
03787   case ISD::SETUO:   return X86::COND_P;
03788   case ISD::SETO:    return X86::COND_NP;
03789   case ISD::SETOEQ:
03790   case ISD::SETUNE:  return X86::COND_INVALID;
03791   }
03792 }
03793 
03794 /// hasFPCMov - Is there a floating point cmov for the specific X86 condition
03795 /// code? The current x86 ISA includes the following FP cmov instructions:
03796 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03797 static bool hasFPCMov(unsigned X86CC) {
03798   switch (X86CC) {
03799   default:
03800     return false;
03801   case X86::COND_B:
03802   case X86::COND_BE:
03803   case X86::COND_E:
03804   case X86::COND_P:
03805   case X86::COND_A:
03806   case X86::COND_AE:
03807   case X86::COND_NE:
03808   case X86::COND_NP:
03809     return true;
03810   }
03811 }
03812 
03813 /// isFPImmLegal - Returns true if the target can instruction select the
03814 /// specified FP immediate natively. If false, the legalizer will
03815 /// materialize the FP immediate as a load from a constant pool.
03816 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03817   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03818     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03819       return true;
03820   }
03821   return false;
03822 }
03823 
03824 /// \brief Returns true if it is beneficial to convert a load of a constant
03825 /// to just the constant itself.
03826 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03827                                                           Type *Ty) const {
03828   assert(Ty->isIntegerTy());
03829 
03830   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03831   if (BitSize == 0 || BitSize > 64)
03832     return false;
03833   return true;
03834 }
03835 
03836 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03837 /// the specified half-open range [Low, Hi).
03838 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03839   return (Val < 0) || (Val >= Low && Val < Hi);
03840 }
03841 
03842 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03843 /// specified value.
03844 static bool isUndefOrEqual(int Val, int CmpVal) {
03845   return (Val < 0 || Val == CmpVal);
03846 }
03847 
03848 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03849 /// at position Pos and ending at Pos+Size-1, is either undef or matches the
03850 /// sequential values Low, Low+1, ... (i.e. Mask[Pos+i] is undef or Low+i).
03851 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03852                                        unsigned Pos, unsigned Size, int Low) {
03853   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03854     if (!isUndefOrEqual(Mask[i], Low))
03855       return false;
03856   return true;
03857 }
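// A minimal illustration of the three mask helpers above; the helper name and
// mask values are hypothetical, kept under '#if 0' so they are not compiled.
#if 0
static void exampleMaskHelpers() {
  int M[] = {0, -1, 2, 3};
  assert(isUndefOrEqual(M[1], 1));                // undef (-1) matches anything
  assert(isUndefOrInRange(M[2], 0, 4));           // 2 lies in [0, 4)
  assert(isSequentialOrUndefInRange(M, 0, 4, 0)); // 0, undef, 2, 3 from Low=0
}
#endif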
03858 
03859 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
03860 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
03861 /// the second operand.
03862 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
03863   if (VT == MVT::v4f32 || VT == MVT::v4i32)
03864     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
03865   if (VT == MVT::v2f64 || VT == MVT::v2i64)
03866     return (Mask[0] < 2 && Mask[1] < 2);
03867   return false;
03868 }
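// A minimal illustration: PSHUFD only reads its first operand, so a v4i32
// reversal is accepted while any reference to the second operand (index >= 4)
// is not. Hypothetical helper, kept under '#if 0' so it is not compiled.
#if 0
static void examplePSHUFDMask() {
  int Rev[] = {3, 2, 1, 0};
  int Mix[] = {0, 5, 2, 3};
  assert(isPSHUFDMask(Rev, MVT::v4i32));
  assert(!isPSHUFDMask(Mix, MVT::v4i32));
}
#endif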
03869 
03870 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
03871 /// is suitable for input to PSHUFHW.
03872 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03873   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03874     return false;
03875 
03876   // Lower quadword copied in order or undef.
03877   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
03878     return false;
03879 
03880   // Upper quadword shuffled.
03881   for (unsigned i = 4; i != 8; ++i)
03882     if (!isUndefOrInRange(Mask[i], 4, 8))
03883       return false;
03884 
03885   if (VT == MVT::v16i16) {
03886     // Lower quadword copied in order or undef.
03887     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
03888       return false;
03889 
03890     // Upper quadword shuffled.
03891     for (unsigned i = 12; i != 16; ++i)
03892       if (!isUndefOrInRange(Mask[i], 12, 16))
03893         return false;
03894   }
03895 
03896   return true;
03897 }
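// A minimal illustration: PSHUFHW leaves the low quadword in place and only
// permutes elements 4-7 within the high quadword. Hypothetical helper, kept
// under '#if 0' so it is not compiled.
#if 0
static void examplePSHUFHWMask() {
  int Ok[]  = {0, 1, 2, 3, 7, 6, 5, 4}; // high quadword reversed in-lane
  int Bad[] = {0, 1, 2, 3, 0, 1, 2, 3}; // pulls low elements into the high half
  assert(isPSHUFHWMask(Ok, MVT::v8i16, /*HasInt256=*/false));
  assert(!isPSHUFHWMask(Bad, MVT::v8i16, /*HasInt256=*/false));
}
#endif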
03898 
03899 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
03900 /// is suitable for input to PSHUFLW.
03901 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03902   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03903     return false;
03904 
03905   // Upper quadword copied in order.
03906   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
03907     return false;
03908 
03909   // Lower quadword shuffled.
03910   for (unsigned i = 0; i != 4; ++i)
03911     if (!isUndefOrInRange(Mask[i], 0, 4))
03912       return false;
03913 
03914   if (VT == MVT::v16i16) {
03915     // Upper quadword copied in order.
03916     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
03917       return false;
03918 
03919     // Lower quadword shuffled.
03920     for (unsigned i = 8; i != 12; ++i)
03921       if (!isUndefOrInRange(Mask[i], 8, 12))
03922         return false;
03923   }
03924 
03925   return true;
03926 }
03927 
03928 /// \brief Return true if the mask specifies a shuffle of elements that is
03929 /// suitable for input to intralane (palignr) or interlane (valign) vector
03930 /// right-shift.
03931 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
03932   unsigned NumElts = VT.getVectorNumElements();
03933   unsigned NumLanes = InterLane ? 1 : VT.getSizeInBits()/128;
03934   unsigned NumLaneElts = NumElts/NumLanes;
03935 
03936   // Do not handle 64-bit element shuffles with palignr.
03937   if (NumLaneElts == 2)
03938     return false;
03939 
03940   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
03941     unsigned i;
03942     for (i = 0; i != NumLaneElts; ++i) {
03943       if (Mask[i+l] >= 0)
03944         break;
03945     }
03946 
03947     // Lane is all undef, go to next lane
03948     if (i == NumLaneElts)
03949       continue;
03950 
03951     int Start = Mask[i+l];
03952 
03953     // Make sure it's in this lane in one of the sources.
03954     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
03955         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
03956       return false;
03957 
03958     // If not lane 0, then we must match lane 0
03959     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
03960       return false;
03961 
03962     // Correct second source to be contiguous with first source
03963     if (Start >= (int)NumElts)
03964       Start -= NumElts - NumLaneElts;
03965 
03966     // Make sure we're shifting in the right direction.
03967     if (Start <= (int)(i+l))
03968       return false;
03969 
03970     Start -= i;
03971 
03972     // Check the rest of the elements to see if they are consecutive.
03973     for (++i; i != NumLaneElts; ++i) {
03974       int Idx = Mask[i+l];
03975 
03976       // Make sure it's in this lane.
03977       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
03978           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
03979         return false;
03980 
03981       // If not lane 0, then we must match lane 0
03982       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
03983         return false;
03984 
03985       if (Idx >= (int)NumElts)
03986         Idx -= NumElts - NumLaneElts;
03987 
03988       if (!isUndefOrEqual(Idx, Start+i))
03989         return false;
03990 
03991     }
03992   }
03993 
03994   return true;
03995 }
03996 
03997 /// \brief Return true if the node specifies a shuffle of elements that is
03998 /// suitable for input to PALIGNR.
03999 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
04000                           const X86Subtarget *Subtarget) {
04001   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
04002       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
04003       VT.is512BitVector())
04004     // FIXME: Add AVX512BW.
04005     return false;
04006 
04007   return isAlignrMask(Mask, VT, false);
04008 }
04009 
04010 /// \brief Return true if the node specifies a shuffle of elements that is
04011 /// suitable for input to VALIGN.
04012 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
04013                           const X86Subtarget *Subtarget) {
04014   // FIXME: Add AVX512VL.
04015   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
04016     return false;
04017   return isAlignrMask(Mask, VT, true);
04018 }
04019 
04020 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
04021 /// the two vector operands have swapped position.
04022 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
04023                                      unsigned NumElems) {
04024   for (unsigned i = 0; i != NumElems; ++i) {
04025     int idx = Mask[i];
04026     if (idx < 0)
04027       continue;
04028     else if (idx < (int)NumElems)
04029       Mask[i] = idx + NumElems;
04030     else
04031       Mask[i] = idx - NumElems;
04032   }
04033 }
04034 
04035 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
04036 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
04037 /// SHUFPS and SHUFPD. If Commuted is true, it checks whether the sources are
04038 /// in the reverse order of what the x86 shuffles want.
04039 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
04040 
04041   unsigned NumElems = VT.getVectorNumElements();
04042   unsigned NumLanes = VT.getSizeInBits()/128;
04043   unsigned NumLaneElems = NumElems/NumLanes;
04044 
04045   if (NumLaneElems != 2 && NumLaneElems != 4)
04046     return false;
04047 
04048   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04049   bool symetricMaskRequired =
04050     (VT.getSizeInBits() >= 256) && (EltSize == 32);
04051 
04052   // VSHUFPSY divides the resulting vector into 4 chunks.
04053   // The sources are also split into 4 chunks, and each destination
04054   // chunk must come from a different source chunk.
04055   //
04056   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
04057   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
04058   //
04059   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
04060   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
04061   //
04062   // VSHUFPDY divides the resulting vector into 4 chunks.
04063   // The sources are also split into 4 chunks, and each destination
04064   // chunk must come from a different source chunk.
04065   //
04066   //  SRC1 =>      X3       X2       X1       X0
04067   //  SRC2 =>      Y3       Y2       Y1       Y0
04068   //
04069   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
04070   //
04071   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
04072   unsigned HalfLaneElems = NumLaneElems/2;
04073   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
04074     for (unsigned i = 0; i != NumLaneElems; ++i) {
04075       int Idx = Mask[i+l];
04076       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
04077       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
04078         return false;
04079       // For VSHUFPSY, the mask of the second half must be the same as the
04080       // first but with the appropriate offsets. This works in the same way as
04081       // VPERMILPS works with masks.
04082       if (!symetricMaskRequired || Idx < 0)
04083         continue;
04084       if (MaskVal[i] < 0) {
04085         MaskVal[i] = Idx - l;
04086         continue;
04087       }
04088       if ((signed)(Idx - l) != MaskVal[i])
04089         return false;
04090     }
04091   }
04092 
04093   return true;
04094 }
04095 
04096 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
04097 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
04098 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
04099   if (!VT.is128BitVector())
04100     return false;
04101 
04102   unsigned NumElems = VT.getVectorNumElements();
04103 
04104   if (NumElems != 4)
04105     return false;
04106 
04107   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
04108   return isUndefOrEqual(Mask[0], 6) &&
04109          isUndefOrEqual(Mask[1], 7) &&
04110          isUndefOrEqual(Mask[2], 2) &&
04111          isUndefOrEqual(Mask[3], 3);
04112 }
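// A minimal illustration: MOVHLPS moves the high half of the second source
// into the low half of the first, so the canonical v4f32 mask is <6,7,2,3>.
// Hypothetical helper, kept under '#if 0' so it is not compiled.
#if 0
static void exampleMOVHLPSMask() {
  int M[] = {6, 7, 2, 3};
  assert(isMOVHLPSMask(M, MVT::v4f32));
}
#endif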
04113 
04114 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
04115 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
04116 /// <2, 3, 2, 3>
04117 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
04118   if (!VT.is128BitVector())
04119     return false;
04120 
04121   unsigned NumElems = VT.getVectorNumElements();
04122 
04123   if (NumElems != 4)
04124     return false;
04125 
04126   return isUndefOrEqual(Mask[0], 2) &&
04127          isUndefOrEqual(Mask[1], 3) &&
04128          isUndefOrEqual(Mask[2], 2) &&
04129          isUndefOrEqual(Mask[3], 3);
04130 }
04131 
04132 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
04133 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
04134 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
04135   if (!VT.is128BitVector())
04136     return false;
04137 
04138   unsigned NumElems = VT.getVectorNumElements();
04139 
04140   if (NumElems != 2 && NumElems != 4)
04141     return false;
04142 
04143   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04144     if (!isUndefOrEqual(Mask[i], i + NumElems))
04145       return false;
04146 
04147   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
04148     if (!isUndefOrEqual(Mask[i], i))
04149       return false;
04150 
04151   return true;
04152 }
04153 
04154 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
04155 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
04156 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
04157   if (!VT.is128BitVector())
04158     return false;
04159 
04160   unsigned NumElems = VT.getVectorNumElements();
04161 
04162   if (NumElems != 2 && NumElems != 4)
04163     return false;
04164 
04165   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04166     if (!isUndefOrEqual(Mask[i], i))
04167       return false;
04168 
04169   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04170     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
04171       return false;
04172 
04173   return true;
04174 }
04175 
04176 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
04177 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
04178 /// i.e., if all but one element come from the same vector.
04179 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
04180   // TODO: Deal with AVX's VINSERTPS
04181   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
04182     return false;
04183 
04184   unsigned CorrectPosV1 = 0;
04185   unsigned CorrectPosV2 = 0;
04186   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
04187     if (Mask[i] == -1) {
04188       ++CorrectPosV1;
04189       ++CorrectPosV2;
04190       continue;
04191     }
04192 
04193     if (Mask[i] == i)
04194       ++CorrectPosV1;
04195     else if (Mask[i] == i + 4)
04196       ++CorrectPosV2;
04197   }
04198 
04199   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
04200     // We have 3 elements (undefs count as elements from any vector) from one
04201     // vector, and one from another.
04202     return true;
04203 
04204   return false;
04205 }
04206 
04207 //
04208 // Some special combinations that can be optimized.
04209 //
04210 static
04211 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
04212                                SelectionDAG &DAG) {
04213   MVT VT = SVOp->getSimpleValueType(0);
04214   SDLoc dl(SVOp);
04215 
04216   if (VT != MVT::v8i32 && VT != MVT::v8f32)
04217     return SDValue();
04218 
04219   ArrayRef<int> Mask = SVOp->getMask();
04220 
04221   // These are the special masks that may be optimized.
04222   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
04223   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
04224   bool MatchEvenMask = true;
04225   bool MatchOddMask  = true;
04226   for (int i=0; i<8; ++i) {
04227     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
04228       MatchEvenMask = false;
04229     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
04230       MatchOddMask = false;
04231   }
04232 
04233   if (!MatchEvenMask && !MatchOddMask)
04234     return SDValue();
04235 
04236   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
04237 
04238   SDValue Op0 = SVOp->getOperand(0);
04239   SDValue Op1 = SVOp->getOperand(1);
04240 
04241   if (MatchEvenMask) {
04242     // Shift the second operand right by 32 bits.
04243     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
04244     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
04245   } else {
04246     // Shift the first operand left by 32 bits.
04247     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
04248     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
04249   }
04250   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
04251   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
04252 }
04253 
04254 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
04255 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
04256 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
04257                          bool HasInt256, bool V2IsSplat = false) {
04258 
04259   assert(VT.getSizeInBits() >= 128 &&
04260          "Unsupported vector type for unpckl");
04261 
04262   unsigned NumElts = VT.getVectorNumElements();
04263   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04264       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04265     return false;
04266 
04267   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
04268          "Unsupported vector type for unpckh");
04269 
04270   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04271   unsigned NumLanes = VT.getSizeInBits()/128;
04272   unsigned NumLaneElts = NumElts/NumLanes;
04273 
04274   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04275     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04276       int BitI  = Mask[l+i];
04277       int BitI1 = Mask[l+i+1];
04278       if (!isUndefOrEqual(BitI, j))
04279         return false;
04280       if (V2IsSplat) {
04281         if (!isUndefOrEqual(BitI1, NumElts))
04282           return false;
04283       } else {
04284         if (!isUndefOrEqual(BitI1, j + NumElts))
04285           return false;
04286       }
04287     }
04288   }
04289 
04290   return true;
04291 }
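// A minimal illustration: the classic v4i32 interleave-low mask <0,4,1,5> is
// accepted here, while the interleave-high mask <2,6,3,7> belongs to UNPCKH.
// Hypothetical helper, kept under '#if 0' so it is not compiled.
#if 0
static void exampleUNPCKLMask() {
  int Lo[] = {0, 4, 1, 5};
  int Hi[] = {2, 6, 3, 7};
  assert(isUNPCKLMask(Lo, MVT::v4i32, /*HasInt256=*/false));
  assert(!isUNPCKLMask(Hi, MVT::v4i32, /*HasInt256=*/false));
}
#endif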
04292 
04293 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
04294 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
04295 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
04296                          bool HasInt256, bool V2IsSplat = false) {
04297   assert(VT.getSizeInBits() >= 128 &&
04298          "Unsupported vector type for unpckh");
04299 
04300   unsigned NumElts = VT.getVectorNumElements();
04301   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04302       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04303     return false;
04304 
04305   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
04306          "Unsupported vector type for unpckh");
04307 
04308   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04309   unsigned NumLanes = VT.getSizeInBits()/128;
04310   unsigned NumLaneElts = NumElts/NumLanes;
04311 
04312   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04313     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04314       int BitI  = Mask[l+i];
04315       int BitI1 = Mask[l+i+1];
04316       if (!isUndefOrEqual(BitI, j))
04317         return false;
04318       if (V2IsSplat) {
04319         if (isUndefOrEqual(BitI1, NumElts))
04320           return false;
04321       } else {
04322         if (!isUndefOrEqual(BitI1, j+NumElts))
04323           return false;
04324       }
04325     }
04326   }
04327   return true;
04328 }
04329 
04330 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
04331 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
04332 /// <0, 0, 1, 1>
04333 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04334   unsigned NumElts = VT.getVectorNumElements();
04335   bool Is256BitVec = VT.is256BitVector();
04336 
04337   if (VT.is512BitVector())
04338     return false;
04339   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04340          "Unsupported vector type for unpckh");
04341 
04342   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
04343       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04344     return false;
04345 
04346   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
04347   // FIXME: Need a better way to get rid of this, there's no latency difference
04348   // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
04349   // the former later. We should also remove the "_undef" special mask.
04350   if (NumElts == 4 && Is256BitVec)
04351     return false;
04352 
04353   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04354   // independently on 128-bit lanes.
04355   unsigned NumLanes = VT.getSizeInBits()/128;
04356   unsigned NumLaneElts = NumElts/NumLanes;
04357 
04358   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04359     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04360       int BitI  = Mask[l+i];
04361       int BitI1 = Mask[l+i+1];
04362 
04363       if (!isUndefOrEqual(BitI, j))
04364         return false;
04365       if (!isUndefOrEqual(BitI1, j))
04366         return false;
04367     }
04368   }
04369 
04370   return true;
04371 }
04372 
04373 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
04374 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
04375 /// <2, 2, 3, 3>
04376 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04377   unsigned NumElts = VT.getVectorNumElements();
04378 
04379   if (VT.is512BitVector())
04380     return false;
04381 
04382   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04383          "Unsupported vector type for unpckh");
04384 
04385   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04386       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04387     return false;
04388 
04389   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04390   // independently on 128-bit lanes.
04391   unsigned NumLanes = VT.getSizeInBits()/128;
04392   unsigned NumLaneElts = NumElts/NumLanes;
04393 
04394   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04395     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04396       int BitI  = Mask[l+i];
04397       int BitI1 = Mask[l+i+1];
04398       if (!isUndefOrEqual(BitI, j))
04399         return false;
04400       if (!isUndefOrEqual(BitI1, j))
04401         return false;
04402     }
04403   }
04404   return true;
04405 }
04406 
04407 // Match for INSERTI64x4/INSERTF64x4 instructions: either (src0[0], src1[0]) or
04408 // (src1[0], src0[1]); both manipulate 256-bit sub-vectors.
04409 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
04410   if (!VT.is512BitVector())
04411     return false;
04412 
04413   unsigned NumElts = VT.getVectorNumElements();
04414   unsigned HalfSize = NumElts/2;
04415   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
04416     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
04417       *Imm = 1;
04418       return true;
04419     }
04420   }
04421   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
04422     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
04423       *Imm = 0;
04424       return true;
04425     }
04426   }
04427   return false;
04428 }
04429 
04430 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
04431 /// specifies a shuffle of elements that is suitable for input to MOVSS,
04432 /// MOVSD, and MOVD, i.e. setting the lowest element.
04433 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
04434   if (VT.getVectorElementType().getSizeInBits() < 32)
04435     return false;
04436   if (!VT.is128BitVector())
04437     return false;
04438 
04439   unsigned NumElts = VT.getVectorNumElements();
04440 
04441   if (!isUndefOrEqual(Mask[0], NumElts))
04442     return false;
04443 
04444   for (unsigned i = 1; i != NumElts; ++i)
04445     if (!isUndefOrEqual(Mask[i], i))
04446       return false;
04447 
04448   return true;
04449 }
04450 
04451 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
04452 /// as permutations between 128-bit chunks or halves. As an example: this
04453 /// shuffle below:
04454 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
04455 /// The first half comes from the second half of V1 and the second half from
04456 /// the second half of V2.
04457 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04458   if (!HasFp256 || !VT.is256BitVector())
04459     return false;
04460 
04461   // The shuffle result is divided into half A and half B. In total the two
04462   // sources have 4 halves, namely: C, D, E, F. The final values of A and
04463   // B must come from C, D, E or F.
04464   unsigned HalfSize = VT.getVectorNumElements()/2;
04465   bool MatchA = false, MatchB = false;
04466 
04467   // Check if A comes from one of C, D, E, F.
04468   for (unsigned Half = 0; Half != 4; ++Half) {
04469     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
04470       MatchA = true;
04471       break;
04472     }
04473   }
04474 
04475   // Check if B comes from one of C, D, E, F.
04476   for (unsigned Half = 0; Half != 4; ++Half) {
04477     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
04478       MatchB = true;
04479       break;
04480     }
04481   }
04482 
04483   return MatchA && MatchB;
04484 }
04485 
04486 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
04487 /// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
04488 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
04489   MVT VT = SVOp->getSimpleValueType(0);
04490 
04491   unsigned HalfSize = VT.getVectorNumElements()/2;
04492 
04493   unsigned FstHalf = 0, SndHalf = 0;
04494   for (unsigned i = 0; i < HalfSize; ++i) {
04495     if (SVOp->getMaskElt(i) > 0) {
04496       FstHalf = SVOp->getMaskElt(i)/HalfSize;
04497       break;
04498     }
04499   }
04500   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
04501     if (SVOp->getMaskElt(i) > 0) {
04502       SndHalf = SVOp->getMaskElt(i)/HalfSize;
04503       break;
04504     }
04505   }
04506 
04507   return (FstHalf | (SndHalf << 4));
04508 }
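// Worked example of the immediate computed above, using the same mask the
// isVPERM2X128Mask comment shows: for v8i32 <4,5,6,7,12,13,14,15>, HalfSize is
// 4, the low half selects source-half 1 and the high half selects source-half
// 3, giving 1 | (3 << 4). Kept under '#if 0' so it is not compiled.
#if 0
static_assert(((4 / 4) | ((12 / 4) << 4)) == 0x31,
              "VPERM2X128 immediate for <4..7, 12..15>");
#endif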
04509 
04510 // Symmetric in-lane mask. Each lane has 4 elements (for imm8).
04511 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
04512   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04513   if (EltSize < 32)
04514     return false;
04515 
04516   unsigned NumElts = VT.getVectorNumElements();
04517   Imm8 = 0;
04518   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
04519     for (unsigned i = 0; i != NumElts; ++i) {
04520       if (Mask[i] < 0)
04521         continue;
04522       Imm8 |= Mask[i] << (i*2);
04523     }
04524     return true;
04525   }
04526 
04527   unsigned LaneSize = 4;
04528   SmallVector<int, 4> MaskVal(LaneSize, -1);
04529 
04530   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04531     for (unsigned i = 0; i != LaneSize; ++i) {
04532       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04533         return false;
04534       if (Mask[i+l] < 0)
04535         continue;
04536       if (MaskVal[i] < 0) {
04537         MaskVal[i] = Mask[i+l] - l;
04538         Imm8 |= MaskVal[i] << (i*2);
04539         continue;
04540       }
04541       if (Mask[i+l] != (signed)(MaskVal[i]+l))
04542         return false;
04543     }
04544   }
04545   return true;
04546 }
04547 
04548 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
04549 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
04550 /// Note that VPERMIL mask matching differs depending on whether the underlying
04551 /// type is 32 or 64 bit. For VPERMILPS the high half of the mask should select
04552 /// the same elements as the low half, but from the upper half of the source.
04553 /// For VPERMILPD the two lanes can be shuffled independently of each other,
04554 /// with the restriction that lanes can't be crossed. Also handles PSHUFDY.
04555 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
04556   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04557   if (VT.getSizeInBits() < 256 || EltSize < 32)
04558     return false;
04559   bool symetricMaskRequired = (EltSize == 32);
04560   unsigned NumElts = VT.getVectorNumElements();
04561 
04562   unsigned NumLanes = VT.getSizeInBits()/128;
04563   unsigned LaneSize = NumElts/NumLanes;
04564   // 2 or 4 elements in one lane
04565 
04566   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
04567   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04568     for (unsigned i = 0; i != LaneSize; ++i) {
04569       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04570         return false;
04571       if (symetricMaskRequired) {
04572         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
04573           ExpectedMaskVal[i] = Mask[i+l] - l;
04574           continue;
04575         }
04576         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
04577           return false;
04578       }
04579     }
04580   }
04581   return true;
04582 }
04583 
04584 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
04585 /// x86 movss wants: x86 movss requires the lowest element to be the lowest
04586 /// element of vector 2, and the other elements to come from vector 1 in order.
04587 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
04588                                bool V2IsSplat = false, bool V2IsUndef = false) {
04589   if (!VT.is128BitVector())
04590     return false;
04591 
04592   unsigned NumOps = VT.getVectorNumElements();
04593   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
04594     return false;
04595 
04596   if (!isUndefOrEqual(Mask[0], 0))
04597     return false;
04598 
04599   for (unsigned i = 1; i != NumOps; ++i)
04600     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
04601           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
04602           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
04603       return false;
04604 
04605   return true;
04606 }
04607 
04608 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04609 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
04610 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
04611 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
04612                            const X86Subtarget *Subtarget) {
04613   if (!Subtarget->hasSSE3())
04614     return false;
04615 
04616   unsigned NumElems = VT.getVectorNumElements();
04617 
04618   if ((VT.is128BitVector() && NumElems != 4) ||
04619       (VT.is256BitVector() && NumElems != 8) ||
04620       (VT.is512BitVector() && NumElems != 16))
04621     return false;
04622 
04623   // "i+1" is the value the indexed mask element must have
04624   for (unsigned i = 0; i != NumElems; i += 2)
04625     if (!isUndefOrEqual(Mask[i], i+1) ||
04626         !isUndefOrEqual(Mask[i+1], i+1))
04627       return false;
04628 
04629   return true;
04630 }
04631 
04632 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04633 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
04634 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
04635 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
04636                            const X86Subtarget *Subtarget) {
04637   if (!Subtarget->hasSSE3())
04638     return false;
04639 
04640   unsigned NumElems = VT.getVectorNumElements();
04641 
04642   if ((VT.is128BitVector() && NumElems != 4) ||
04643       (VT.is256BitVector() && NumElems != 8) ||
04644       (VT.is512BitVector() && NumElems != 16))
04645     return false;
04646 
04647   // "i" is the value the indexed mask element must have
04648   for (unsigned i = 0; i != NumElems; i += 2)
04649     if (!isUndefOrEqual(Mask[i], i) ||
04650         !isUndefOrEqual(Mask[i+1], i))
04651       return false;
04652 
04653   return true;
04654 }
04655 
04656 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
04657 /// specifies a shuffle of elements that is suitable for input to 256-bit
04658 /// version of MOVDDUP.
04659 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04660   if (!HasFp256 || !VT.is256BitVector())
04661     return false;
04662 
04663   unsigned NumElts = VT.getVectorNumElements();
04664   if (NumElts != 4)
04665     return false;
04666 
04667   for (unsigned i = 0; i != NumElts/2; ++i)
04668     if (!isUndefOrEqual(Mask[i], 0))
04669       return false;
04670   for (unsigned i = NumElts/2; i != NumElts; ++i)
04671     if (!isUndefOrEqual(Mask[i], NumElts/2))
04672       return false;
04673   return true;
04674 }
04675 
04676 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04677 /// specifies a shuffle of elements that is suitable for input to 128-bit
04678 /// version of MOVDDUP.
04679 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
04680   if (!VT.is128BitVector())
04681     return false;
04682 
04683   unsigned e = VT.getVectorNumElements() / 2;
04684   for (unsigned i = 0; i != e; ++i)
04685     if (!isUndefOrEqual(Mask[i], i))
04686       return false;
04687   for (unsigned i = 0; i != e; ++i)
04688     if (!isUndefOrEqual(Mask[e+i], i))
04689       return false;
04690   return true;
04691 }
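// A minimal illustration: MOVDDUP broadcasts the low 64-bit half, so the
// v4i32 mask <0,1,0,1> (equivalently <0,0> on v2f64) is accepted.
// Hypothetical helper, kept under '#if 0' so it is not compiled.
#if 0
static void exampleMOVDDUPMask() {
  int M[] = {0, 1, 0, 1};
  assert(isMOVDDUPMask(M, MVT::v4i32));
}
#endif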
04692 
04693 /// isVEXTRACTIndex - Return true if the specified
04694 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
04695 /// suitable for instructions that extract 128- or 256-bit vectors.
04696 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
04697   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04698   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04699     return false;
04700 
04701   // The index should be aligned on a vecWidth-bit boundary.
04702   uint64_t Index =
04703     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04704 
04705   MVT VT = N->getSimpleValueType(0);
04706   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04707   bool Result = (Index * ElSize) % vecWidth == 0;
04708 
04709   return Result;
04710 }
04711 
04712 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
04713 /// operand specifies a subvector insert that is suitable for the insertion of
04714 /// 128- or 256-bit subvectors.
04715 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
04716   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04717   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04718     return false;
04719   // The index should be aligned on a vecWidth-bit boundary.
04720   uint64_t Index =
04721     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04722 
04723   MVT VT = N->getSimpleValueType(0);
04724   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04725   bool Result = (Index * ElSize) % vecWidth == 0;
04726 
04727   return Result;
04728 }
04729 
04730 bool X86::isVINSERT128Index(SDNode *N) {
04731   return isVINSERTIndex(N, 128);
04732 }
04733 
04734 bool X86::isVINSERT256Index(SDNode *N) {
04735   return isVINSERTIndex(N, 256);
04736 }
04737 
04738 bool X86::isVEXTRACT128Index(SDNode *N) {
04739   return isVEXTRACTIndex(N, 128);
04740 }
04741 
04742 bool X86::isVEXTRACT256Index(SDNode *N) {
04743   return isVEXTRACTIndex(N, 256);
04744 }
04745 
04746 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
04747 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
04748 /// Handles 128-bit and 256-bit.
04749 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
04750   MVT VT = N->getSimpleValueType(0);
04751 
04752   assert((VT.getSizeInBits() >= 128) &&
04753          "Unsupported vector type for PSHUF/SHUFP");
04754 
04755   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
04756   // independently on 128-bit lanes.
04757   unsigned NumElts = VT.getVectorNumElements();
04758   unsigned NumLanes = VT.getSizeInBits()/128;
04759   unsigned NumLaneElts = NumElts/NumLanes;
04760 
04761   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
04762          "Only supports 2, 4 or 8 elements per lane");
04763 
04764   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
04765   unsigned Mask = 0;
04766   for (unsigned i = 0; i != NumElts; ++i) {
04767     int Elt = N->getMaskElt(i);
04768     if (Elt < 0) continue;
04769     Elt &= NumLaneElts - 1;
04770     unsigned ShAmt = (i << Shift) % 8;
04771     Mask |= Elt << ShAmt;
04772   }
04773 
04774   return Mask;
04775 }
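// Worked example of the 2-bits-per-element packing above: the v4i32 reversal
// mask <3,2,1,0> encodes as 3 | (2 << 2) | (1 << 4) | (0 << 6) == 0x1B.
// Kept under '#if 0' so it is not compiled.
#if 0
static_assert((3 | (2 << 2) | (1 << 4) | (0 << 6)) == 0x1B,
              "PSHUFD immediate for the v4i32 reversal mask");
#endif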
04776 
04777 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
04778 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
04779 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
04780   MVT VT = N->getSimpleValueType(0);
04781 
04782   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04783          "Unsupported vector type for PSHUFHW");
04784 
04785   unsigned NumElts = VT.getVectorNumElements();
04786 
04787   unsigned Mask = 0;
04788   for (unsigned l = 0; l != NumElts; l += 8) {
04789     // 8 nodes per lane, but we only care about the last 4.
04790     for (unsigned i = 0; i < 4; ++i) {
04791       int Elt = N->getMaskElt(l+i+4);
04792       if (Elt < 0) continue;
04793       Elt &= 0x3; // only 2-bits.
04794       Mask |= Elt << (i * 2);
04795     }
04796   }
04797 
04798   return Mask;
04799 }
04800 
04801 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
04802 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
04803 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
04804   MVT VT = N->getSimpleValueType(0);
04805 
04806   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04807          "Unsupported vector type for PSHUFHW");
04808 
04809   unsigned NumElts = VT.getVectorNumElements();
04810 
04811   unsigned Mask = 0;
04812   for (unsigned l = 0; l != NumElts; l += 8) {
04813     // 8 nodes per lane, but we only care about the first 4.
04814     for (unsigned i = 0; i < 4; ++i) {
04815       int Elt = N->getMaskElt(l+i);
04816       if (Elt < 0) continue;
04817       Elt &= 0x3; // only 2-bits
04818       Mask |= Elt << (i * 2);
04819     }
04820   }
04821 
04822   return Mask;
04823 }
04824 
04825 /// \brief Return the appropriate immediate to shuffle the specified
04826 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
04827 /// VALIGN (if InterLane is true) instructions.
04828 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
04829                                            bool InterLane) {
04830   MVT VT = SVOp->getSimpleValueType(0);
04831   unsigned EltSize = InterLane ? 1 :
04832     VT.getVectorElementType().getSizeInBits() >> 3;
04833 
04834   unsigned NumElts = VT.getVectorNumElements();
04835   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
04836   unsigned NumLaneElts = NumElts/NumLanes;
04837 
04838   int Val = 0;
04839   unsigned i;
04840   for (i = 0; i != NumElts; ++i) {
04841     Val = SVOp->getMaskElt(i);
04842     if (Val >= 0)
04843       break;
04844   }
04845   if (Val >= (int)NumElts)
04846     Val -= NumElts - NumLaneElts;
04847 
04848   assert(Val - i > 0 && "PALIGNR imm should be positive");
04849   return (Val - i) * EltSize;
04850 }
04851 
04852 /// \brief Return the appropriate immediate to shuffle the specified
04853 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
04854 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
04855   return getShuffleAlignrImmediate(SVOp, false);
04856 }
04857 
04858 /// \brief Return the appropriate immediate to shuffle the specified
04859 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
04860 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
04861   return getShuffleAlignrImmediate(SVOp, true);
04862 }
04863 
04864 
04865 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
04866   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04867   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04868     llvm_unreachable("Illegal extract subvector for VEXTRACT");
04869 
04870   uint64_t Index =
04871     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04872 
04873   MVT VecVT = N->getOperand(0).getSimpleValueType();
04874   MVT ElVT = VecVT.getVectorElementType();
04875 
04876   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04877   return Index / NumElemsPerChunk;
04878 }
04879 
04880 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
04881   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04882   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04883     llvm_unreachable("Illegal insert subvector for VINSERT");
04884 
04885   uint64_t Index =
04886     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04887 
04888   MVT VecVT = N->getSimpleValueType(0);
04889   MVT ElVT = VecVT.getVectorElementType();
04890 
04891   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04892   return Index / NumElemsPerChunk;
04893 }
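// Worked example of the chunk arithmetic above: extracting the upper four
// floats of a v8f32 with a 128-bit VEXTRACT starts at element 4, each chunk
// holds 128/32 == 4 elements, so the instruction immediate is 4 / 4 == 1.
// Kept under '#if 0' so it is not compiled.
#if 0
static_assert(4 / (128 / 32) == 1, "upper half of v8f32 is chunk 1");
#endif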
04894 
04895 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
04896 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
04897 /// and VEXTRACTI128 instructions.
04898 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
04899   return getExtractVEXTRACTImmediate(N, 128);
04900 }
04901 
04902 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
04903 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
04904 /// and VEXTRACTI64x4 instructions.
04905 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
04906   return getExtractVEXTRACTImmediate(N, 256);
04907 }
04908 
04909 /// getInsertVINSERT128Immediate - Return the appropriate immediate
04910 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
04911 /// and VINSERTI128 instructions.
04912 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
04913   return getInsertVINSERTImmediate(N, 128);
04914 }
04915 
04916 /// getInsertVINSERT256Immediate - Return the appropriate immediate
04917 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
04918 /// and VINSERTI64x4 instructions.
04919 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
04920   return getInsertVINSERTImmediate(N, 256);
04921 }
04922 
04923 /// isZero - Returns true if V is a constant integer zero.
04924 static bool isZero(SDValue V) {
04925   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
04926   return C && C->isNullValue();
04927 }
04928 
04929 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
04930 /// constant +0.0.
04931 bool X86::isZeroNode(SDValue Elt) {
04932   if (isZero(Elt))
04933     return true;
04934   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
04935     return CFP->getValueAPF().isPosZero();
04936   return false;
04937 }
04938 
04939 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
04940 /// match movhlps. The lower half elements should come from upper half of
04941 /// V1 (and in order), and the upper half elements should come from the upper
04942 /// half of V2 (and in order).
04943 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
04944   if (!VT.is128BitVector())
04945     return false;
04946   if (VT.getVectorNumElements() != 4)
04947     return false;
04948   for (unsigned i = 0, e = 2; i != e; ++i)
04949     if (!isUndefOrEqual(Mask[i], i+2))
04950       return false;
04951   for (unsigned i = 2; i != 4; ++i)
04952     if (!isUndefOrEqual(Mask[i], i+4))
04953       return false;
04954   return true;
04955 }
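
// For example, with VT == v4f32 the mask <2, 3, 6, 7> (undefs allowed in any
// position) satisfies the checks above: indices 2 and 3 select the upper half
// of V1, and indices 6 and 7 select the upper half of V2 in the concatenated
// V1/V2 numbering.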
04956 
04957 /// isScalarLoadToVector - Returns true if the node is a scalar load that
04958 /// is promoted to a vector. It also returns the LoadSDNode by reference if
04959 /// required.
04960 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
04961   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
04962     return false;
04963   N = N->getOperand(0).getNode();
04964   if (!ISD::isNON_EXTLoad(N))
04965     return false;
04966   if (LD)
04967     *LD = cast<LoadSDNode>(N);
04968   return true;
04969 }
04970 
04971 // Test whether the given value is a vector value which will be legalized
04972 // into a load.
04973 static bool WillBeConstantPoolLoad(SDNode *N) {
04974   if (N->getOpcode() != ISD::BUILD_VECTOR)
04975     return false;
04976 
04977   // Check for any non-constant elements.
04978   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
04979     switch (N->getOperand(i).getNode()->getOpcode()) {
04980     case ISD::UNDEF:
04981     case ISD::ConstantFP:
04982     case ISD::Constant:
04983       break;
04984     default:
04985       return false;
04986     }
04987 
04988   // Vectors of all-zeros and all-ones are materialized with special
04989   // instructions rather than being loaded.
04990   return !ISD::isBuildVectorAllZeros(N) &&
04991          !ISD::isBuildVectorAllOnes(N);
04992 }
04993 
04994 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
04995 /// match movlp{s|d}. The lower half elements should come from lower half of
04996 /// V1 (and in order), and the upper half elements should come from the upper
04997 /// half of V2 (and in order). And since V1 will become the source of the
04998 /// MOVLP, it must be either a vector load or a scalar load to vector.
04999 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
05000                                ArrayRef<int> Mask, MVT VT) {
05001   if (!VT.is128BitVector())
05002     return false;
05003 
05004   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
05005     return false;
05006   // If V2 is a vector load, don't do this transformation. We will instead
05007   // try to fold the load into a shufps.
05008   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
05009     return false;
05010 
05011   unsigned NumElems = VT.getVectorNumElements();
05012 
05013   if (NumElems != 2 && NumElems != 4)
05014     return false;
05015   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
05016     if (!isUndefOrEqual(Mask[i], i))
05017       return false;
05018   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
05019     if (!isUndefOrEqual(Mask[i], i+NumElems))
05020       return false;
05021   return true;
05022 }
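
// For example, with NumElems == 4 the mask <0, 1, 6, 7> passes the checks
// above: the low half comes from the low half of V1 (indices 0 and 1) and the
// high half comes from the high half of V2 (concatenated indices i + NumElems
// for i = 2, 3).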
05023 
05024 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
05025 /// to a zero vector.
05026 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
05027 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
05028   SDValue V1 = N->getOperand(0);
05029   SDValue V2 = N->getOperand(1);
05030   unsigned NumElems = N->getValueType(0).getVectorNumElements();
05031   for (unsigned i = 0; i != NumElems; ++i) {
05032     int Idx = N->getMaskElt(i);
05033     if (Idx >= (int)NumElems) {
05034       unsigned Opc = V2.getOpcode();
05035       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
05036         continue;
05037       if (Opc != ISD::BUILD_VECTOR ||
05038           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
05039         return false;
05040     } else if (Idx >= 0) {
05041       unsigned Opc = V1.getOpcode();
05042       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
05043         continue;
05044       if (Opc != ISD::BUILD_VECTOR ||
05045           !X86::isZeroNode(V1.getOperand(Idx)))
05046         return false;
05047     }
05048   }
05049   return true;
05050 }
05051 
05052 /// getZeroVector - Returns a vector of specified type with all zero elements.
05053 ///
05054 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
05055                              SelectionDAG &DAG, SDLoc dl) {
05056   assert(VT.isVector() && "Expected a vector type");
05057 
05058   // Always build SSE zero vectors as <4 x i32> bitcasted
05059   // to their dest type. This ensures they get CSE'd.
05060   SDValue Vec;
05061   if (VT.is128BitVector()) {  // SSE
05062     if (Subtarget->hasSSE2()) {  // SSE2
05063       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
05064       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05065     } else { // SSE1
05066       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
05067       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
05068     }
05069   } else if (VT.is256BitVector()) { // AVX
05070     if (Subtarget->hasInt256()) { // AVX2
05071       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
05072       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05073       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
05074     } else {
05075       // 256-bit logic and arithmetic instructions in AVX are all
05076       // floating-point; no integer ops are supported. Emit fp zeroed vectors.
05077       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
05078       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05079       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
05080     }
05081   } else if (VT.is512BitVector()) { // AVX-512
05082       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
05083       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
05084                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05085       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
05086   } else if (VT.getScalarType() == MVT::i1) {
05087     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
05088     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
05089     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05090     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05091   } else
05092     llvm_unreachable("Unexpected vector type");
05093 
05094   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
05095 }
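
// Illustration of the canonicalization above: a request for a zero v2f64 on a
// plain SSE2 target is built as an all-zero v4i32 BUILD_VECTOR and then
// bitcast back to v2f64, so every 128-bit zero vector in the DAG shares the
// same underlying constant node regardless of the requested type.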
05096 
05097 /// getOnesVector - Returns a vector of specified type with all bits set.
05098 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
05099 /// no AVX2 support, use two <4 x i32> vectors inserted into an <8 x i32>.
05100 /// Then bitcast to their original type, ensuring they get CSE'd.
05101 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
05102                              SDLoc dl) {
05103   assert(VT.isVector() && "Expected a vector type");
05104 
05105   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
05106   SDValue Vec;
05107   if (VT.is256BitVector()) {
05108     if (HasInt256) { // AVX2
05109       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05110       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
05111     } else { // AVX
05112       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05113       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
05114     }
05115   } else if (VT.is128BitVector()) {
05116     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05117   } else
05118     llvm_unreachable("Unexpected vector type");
05119 
05120   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
05121 }
05122 
05123 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
05124 /// that point to V2 point to its first element.
05125 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
05126   for (unsigned i = 0; i != NumElems; ++i) {
05127     if (Mask[i] > (int)NumElems) {
05128       Mask[i] = NumElems;
05129     }
05130   }
05131 }
05132 
05133 /// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
05134 /// operation of specified width.
05135 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
05136                        SDValue V2) {
05137   unsigned NumElems = VT.getVectorNumElements();
05138   SmallVector<int, 8> Mask;
05139   Mask.push_back(NumElems);
05140   for (unsigned i = 1; i != NumElems; ++i)
05141     Mask.push_back(i);
05142   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05143 }
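
// For example, getMOVL with a 4-element type produces the mask <4, 1, 2, 3>:
// element 0 of the result comes from element 0 of V2 and the remaining
// elements come from V1, which is the movss/movsd element ordering.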
05144 
05145 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
05146 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05147                           SDValue V2) {
05148   unsigned NumElems = VT.getVectorNumElements();
05149   SmallVector<int, 8> Mask;
05150   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
05151     Mask.push_back(i);
05152     Mask.push_back(i + NumElems);
05153   }
05154   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05155 }
05156 
05157 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
05158 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05159                           SDValue V2) {
05160   unsigned NumElems = VT.getVectorNumElements();
05161   SmallVector<int, 8> Mask;
05162   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
05163     Mask.push_back(i + Half);
05164     Mask.push_back(i + NumElems + Half);
05165   }
05166   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05167 }
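
// For a v4i32 type the two helpers above build the masks
//   getUnpackl: <0, 4, 1, 5>   (low halves of V1 and V2 interleaved)
//   getUnpackh: <2, 6, 3, 7>   (high halves of V1 and V2 interleaved)
// which match the punpckldq / punpckhdq element orderings.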
05168 
05169 // PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
05170 // a generic shuffle instruction because the target has no such instructions.
05171 // Generate shuffles which repeat i16 and i8 several times until they can be
05172 // represented by v4f32 and then be manipulated by target supported shuffles.
05173 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
05174   MVT VT = V.getSimpleValueType();
05175   int NumElems = VT.getVectorNumElements();
05176   SDLoc dl(V);
05177 
05178   while (NumElems > 4) {
05179     if (EltNo < NumElems/2) {
05180       V = getUnpackl(DAG, dl, VT, V, V);
05181     } else {
05182       V = getUnpackh(DAG, dl, VT, V, V);
05183       EltNo -= NumElems/2;
05184     }
05185     NumElems >>= 1;
05186   }
05187   return V;
05188 }
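
// Worked example (illustrative): splatting element 5 of a v8i16. EltNo (5) is
// in the upper half, so one getUnpackh step duplicates the high elements,
// leaving the splat value in 32-bit chunk 1 of the result; EltNo becomes 1 and
// NumElems drops to 4. The caller can then finish the splat with getLegalSplat
// through a v4f32 shuffle.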
05189 
05190 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
05191 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
05192   MVT VT = V.getSimpleValueType();
05193   SDLoc dl(V);
05194 
05195   if (VT.is128BitVector()) {
05196     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
05197     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
05198     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
05199                              &SplatMask[0]);
05200   } else if (VT.is256BitVector()) {
05201     // To use VPERMILPS to splat scalars, the second half of indices must
05202     // refer to the higher part, which is a duplication of the lower one,
05203     // because VPERMILPS can only handle in-lane permutations.
05204     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
05205                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
05206 
05207     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
05208     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
05209                              &SplatMask[0]);
05210   } else
05211     llvm_unreachable("Vector size not supported");
05212 
05213   return DAG.getNode(ISD::BITCAST, dl, VT, V);
05214 }
05215 
05216 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
05217 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
05218   MVT SrcVT = SV->getSimpleValueType(0);
05219   SDValue V1 = SV->getOperand(0);
05220   SDLoc dl(SV);
05221 
05222   int EltNo = SV->getSplatIndex();
05223   int NumElems = SrcVT.getVectorNumElements();
05224   bool Is256BitVec = SrcVT.is256BitVector();
05225 
05226   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
05227          "Unknown how to promote splat for type");
05228 
05229   // Extract the 128-bit part containing the splat element and update
05230   // the splat element index when it refers to the higher register.
05231   if (Is256BitVec) {
05232     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
05233     if (EltNo >= NumElems/2)
05234       EltNo -= NumElems/2;
05235   }
05236 
05237   // i16 and i8 vector types can't be used directly by a generic shuffle
05238   // instruction because the target has no such instruction. Generate shuffles
05239   // which repeat i16 and i8 several times until they fit in i32, and then can
05240   // be manipulated by target supported shuffles.
05241   MVT EltVT = SrcVT.getVectorElementType();
05242   if (EltVT == MVT::i8 || EltVT == MVT::i16)
05243     V1 = PromoteSplati8i16(V1, DAG, EltNo);
05244 
05245   // Recreate the 256-bit vector and place the same 128-bit vector
05246   // into the low and high part. This is necessary because we want
05247   // to use VPERM* to shuffle the vectors
05248   if (Is256BitVec) {
05249     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
05250   }
05251 
05252   return getLegalSplat(DAG, V1, EltNo);
05253 }
05254 
05255 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
05256 /// vector against a zero or undef vector. This produces a shuffle where the low
05257 /// element of V2 is swizzled into the zero/undef vector, landing at element
05258 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
05259 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
05260                                            bool IsZero,
05261                                            const X86Subtarget *Subtarget,
05262                                            SelectionDAG &DAG) {
05263   MVT VT = V2.getSimpleValueType();
05264   SDValue V1 = IsZero
05265     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
05266   unsigned NumElems = VT.getVectorNumElements();
05267   SmallVector<int, 16> MaskVec;
05268   for (unsigned i = 0; i != NumElems; ++i)
05269     // If this is the insertion idx, put the low elt of V2 here.
05270     MaskVec.push_back(i == Idx ? NumElems : i);
05271   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
05272 }
05273 
05274 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
05275 /// target specific opcode. Returns true if the Mask could be calculated. Sets
05276 /// IsUnary to true if it only uses one source. Note that this will set IsUnary for
05277 /// shuffles which use a single input multiple times, and in those cases it will
05278 /// adjust the mask to only have indices within that single input.
05279 static bool getTargetShuffleMask(SDNode *N, MVT VT,
05280                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
05281   unsigned NumElems = VT.getVectorNumElements();
05282   SDValue ImmN;
05283 
05284   IsUnary = false;
05285   bool IsFakeUnary = false;
05286   switch(N->getOpcode()) {
05287   case X86ISD::SHUFP:
05288     ImmN = N->getOperand(N->getNumOperands()-1);
05289     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05290     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05291     break;
05292   case X86ISD::UNPCKH:
05293     DecodeUNPCKHMask(VT, Mask);
05294     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05295     break;
05296   case X86ISD::UNPCKL:
05297     DecodeUNPCKLMask(VT, Mask);
05298     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05299     break;
05300   case X86ISD::MOVHLPS:
05301     DecodeMOVHLPSMask(NumElems, Mask);
05302     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05303     break;
05304   case X86ISD::MOVLHPS:
05305     DecodeMOVLHPSMask(NumElems, Mask);
05306     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05307     break;
05308   case X86ISD::PALIGNR:
05309     ImmN = N->getOperand(N->getNumOperands()-1);
05310     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05311     break;
05312   case X86ISD::PSHUFD:
05313   case X86ISD::VPERMILP:
05314     ImmN = N->getOperand(N->getNumOperands()-1);
05315     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05316     IsUnary = true;
05317     break;
05318   case X86ISD::PSHUFHW:
05319     ImmN = N->getOperand(N->getNumOperands()-1);
05320     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05321     IsUnary = true;
05322     break;
05323   case X86ISD::PSHUFLW:
05324     ImmN = N->getOperand(N->getNumOperands()-1);
05325     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05326     IsUnary = true;
05327     break;
05328   case X86ISD::PSHUFB: {
05329     IsUnary = true;
05330     SDValue MaskNode = N->getOperand(1);
05331     while (MaskNode->getOpcode() == ISD::BITCAST)
05332       MaskNode = MaskNode->getOperand(0);
05333 
05334     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
05335       // If we have a build-vector, then things are easy.
05336       EVT VT = MaskNode.getValueType();
05337       assert(VT.isVector() &&
05338              "Can't produce a non-vector with a build_vector!");
05339       if (!VT.isInteger())
05340         return false;
05341 
05342       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
05343 
05344       SmallVector<uint64_t, 32> RawMask;
05345       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
05346         auto *CN = dyn_cast<ConstantSDNode>(MaskNode->getOperand(i));
05347         if (!CN)
05348           return false;
05349         APInt MaskElement = CN->getAPIntValue();
05350 
05351         // We now have to decode the element which could be any integer size and
05352         // extract each byte of it.
05353         for (int j = 0; j < NumBytesPerElement; ++j) {
05354           // Note that this is x86 and so always little endian: the low byte is
05355           // the first byte of the mask.
05356           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
05357           MaskElement = MaskElement.lshr(8);
05358         }
05359       }
05360       DecodePSHUFBMask(RawMask, Mask);
05361       break;
05362     }
05363 
05364     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
05365     if (!MaskLoad)
05366       return false;
05367 
05368     SDValue Ptr = MaskLoad->getBasePtr();
05369     if (Ptr->getOpcode() == X86ISD::Wrapper)
05370       Ptr = Ptr->getOperand(0);
05371 
05372     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
05373     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
05374       return false;
05375 
05376     if (auto *C = dyn_cast<ConstantDataSequential>(MaskCP->getConstVal())) {
05377       // FIXME: Support AVX-512 here.
05378       if (!C->getType()->isVectorTy() ||
05379           (C->getNumElements() != 16 && C->getNumElements() != 32))
05380         return false;
05381 
05382       assert(C->getType()->isVectorTy() && "Expected a vector constant.");
05383       DecodePSHUFBMask(C, Mask);
05384       break;
05385     }
05386 
05387     return false;
05388   }
05389   case X86ISD::VPERMI:
05390     ImmN = N->getOperand(N->getNumOperands()-1);
05391     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05392     IsUnary = true;
05393     break;
05394   case X86ISD::MOVSS:
05395   case X86ISD::MOVSD: {
05396     // The index 0 always comes from the first element of the second source;
05397     // this is why MOVSS and MOVSD are used in the first place. The other
05398     // elements come from the other positions of the first source vector.
05399     Mask.push_back(NumElems);
05400     for (unsigned i = 1; i != NumElems; ++i) {
05401       Mask.push_back(i);
05402     }
05403     break;
05404   }
05405   case X86ISD::VPERM2X128:
05406     ImmN = N->getOperand(N->getNumOperands()-1);
05407     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05408     if (Mask.empty()) return false;
05409     break;
05410   case X86ISD::MOVSLDUP:
05411     DecodeMOVSLDUPMask(VT, Mask);
05412     break;
05413   case X86ISD::MOVSHDUP:
05414     DecodeMOVSHDUPMask(VT, Mask);
05415     break;
05416   case X86ISD::MOVDDUP:
05417   case X86ISD::MOVLHPD:
05418   case X86ISD::MOVLPD:
05419   case X86ISD::MOVLPS:
05420     // Not yet implemented
05421     return false;
05422   default: llvm_unreachable("unknown target shuffle node");
05423   }
05424 
05425   // If we have a fake unary shuffle, the shuffle mask is spread across two
05426   // inputs that are actually the same node. Re-map the mask to always point
05427   // into the first input.
05428   if (IsFakeUnary)
05429     for (int &M : Mask)
05430       if (M >= (int)Mask.size())
05431         M -= Mask.size();
05432 
05433   return true;
05434 }
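
// Example of the fake-unary remapping above: an X86ISD::UNPCKL of v4i32 whose
// two operands are the same node decodes to <0, 4, 1, 5>; because both inputs
// are identical, indices >= 4 are folded back into the first input and the
// reported mask becomes <0, 0, 1, 1> with IsUnary set.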
05435 
05436 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
05437 /// element of the result of the vector shuffle.
05438 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
05439                                    unsigned Depth) {
05440   if (Depth == 6)
05441     return SDValue();  // Limit search depth.
05442 
05443   SDValue V = SDValue(N, 0);
05444   EVT VT = V.getValueType();
05445   unsigned Opcode = V.getOpcode();
05446 
05447   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
05448   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
05449     int Elt = SV->getMaskElt(Index);
05450 
05451     if (Elt < 0)
05452       return DAG.getUNDEF(VT.getVectorElementType());
05453 
05454     unsigned NumElems = VT.getVectorNumElements();
05455     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
05456                                          : SV->getOperand(1);
05457     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
05458   }
05459 
05460   // Recurse into target specific vector shuffles to find scalars.
05461   if (isTargetShuffle(Opcode)) {
05462     MVT ShufVT = V.getSimpleValueType();
05463     unsigned NumElems = ShufVT.getVectorNumElements();
05464     SmallVector<int, 16> ShuffleMask;
05465     bool IsUnary;
05466 
05467     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
05468       return SDValue();
05469 
05470     int Elt = ShuffleMask[Index];
05471     if (Elt < 0)
05472       return DAG.getUNDEF(ShufVT.getVectorElementType());
05473 
05474     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
05475                                          : N->getOperand(1);
05476     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
05477                                Depth+1);
05478   }
05479 
05480   // Actual nodes that may contain scalar elements
05481   if (Opcode == ISD::BITCAST) {
05482     V = V.getOperand(0);
05483     EVT SrcVT = V.getValueType();
05484     unsigned NumElems = VT.getVectorNumElements();
05485 
05486     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
05487       return SDValue();
05488   }
05489 
05490   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
05491     return (Index == 0) ? V.getOperand(0)
05492                         : DAG.getUNDEF(VT.getVectorElementType());
05493 
05494   if (V.getOpcode() == ISD::BUILD_VECTOR)
05495     return V.getOperand(Index);
05496 
05497   return SDValue();
05498 }
05499 
05500 /// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
05501 /// vector shuffle operation which are zero. The search can start in two
05502 /// different directions, from left or right.
05503 /// We count undefs as zeros until PreferredNum is reached.
05504 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
05505                                          unsigned NumElems, bool ZerosFromLeft,
05506                                          SelectionDAG &DAG,
05507                                          unsigned PreferredNum = -1U) {
05508   unsigned NumZeros = 0;
05509   for (unsigned i = 0; i != NumElems; ++i) {
05510     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
05511     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
05512     if (!Elt.getNode())
05513       break;
05514 
05515     if (X86::isZeroNode(Elt))
05516       ++NumZeros;
05517     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
05518       NumZeros = std::min(NumZeros + 1, PreferredNum);
05519     else
05520       break;
05521   }
05522 
05523   return NumZeros;
05524 }
05525 
05526 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
05527 /// correspond consecutively to elements from one of the vector operands,
05528 /// starting from its index OpIdx. Also sets OpNum to which source vector operand is used.
05529 static
05530 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
05531                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
05532                               unsigned NumElems, unsigned &OpNum) {
05533   bool SeenV1 = false;
05534   bool SeenV2 = false;
05535 
05536   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
05537     int Idx = SVOp->getMaskElt(i);
05538     // Ignore undef indices
05539     if (Idx < 0)
05540       continue;
05541 
05542     if (Idx < (int)NumElems)
05543       SeenV1 = true;
05544     else
05545       SeenV2 = true;
05546 
05547     // Only accept consecutive elements from the same vector
05548     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
05549       return false;
05550   }
05551 
05552   OpNum = SeenV1 ? 0 : 1;
05553   return true;
05554 }
05555 
05556 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
05557 /// logical right shift of a vector.
05558 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05559                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05560   unsigned NumElems =
05561     SVOp->getSimpleValueType(0).getVectorNumElements();
05562   unsigned NumZeros = getNumOfConsecutiveZeros(
05563       SVOp, NumElems, false /* check zeros from right */, DAG,
05564       SVOp->getMaskElt(0));
05565   unsigned OpSrc;
05566 
05567   if (!NumZeros)
05568     return false;
05569 
05570   // Considering the elements in the mask that are not consecutive zeros,
05571   // check if they consecutively come from only one of the source vectors.
05572   //
05573   //               V1 = {X, A, B, C}     0
05574   //                         \  \  \    /
05575   //   vector_shuffle V1, V2 <1, 2, 3, X>
05576   //
05577   if (!isShuffleMaskConsecutive(SVOp,
05578             0,                   // Mask Start Index
05579             NumElems-NumZeros,   // Mask End Index(exclusive)
05580             NumZeros,            // Where to start looking in the src vector
05581             NumElems,            // Number of elements in vector
05582             OpSrc))              // Which source operand ?
05583     return false;
05584 
05585   isLeft = false;
05586   ShAmt = NumZeros;
05587   ShVal = SVOp->getOperand(OpSrc);
05588   return true;
05589 }
05590 
05591 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
05592 /// logical left shift of a vector.
05593 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05594                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05595   unsigned NumElems =
05596     SVOp->getSimpleValueType(0).getVectorNumElements();
05597   unsigned NumZeros = getNumOfConsecutiveZeros(
05598       SVOp, NumElems, true /* check zeros from left */, DAG,
05599       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
05600   unsigned OpSrc;
05601 
05602   if (!NumZeros)
05603     return false;
05604 
05605   // Considering the elements in the mask that are not consecutive zeros,
05606   // check if they consecutively come from only one of the source vectors.
05607   //
05608   //                           0    { A, B, X, X } = V2
05609   //                          / \    /  /
05610   //   vector_shuffle V1, V2 <X, X, 4, 5>
05611   //
05612   if (!isShuffleMaskConsecutive(SVOp,
05613             NumZeros,     // Mask Start Index
05614             NumElems,     // Mask End Index(exclusive)
05615             0,            // Where to start looking in the src vector
05616             NumElems,     // Number of elements in vector
05617             OpSrc))       // Which source operand ?
05618     return false;
05619 
05620   isLeft = true;
05621   ShAmt = NumZeros;
05622   ShVal = SVOp->getOperand(OpSrc);
05623   return true;
05624 }
05625 
05626 /// isVectorShift - Returns true if the shuffle can be implemented as a
05627 /// logical left or right shift of a vector.
05628 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05629                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05630   // Although the logic below supports any vector bit width, there are no
05631   // shift instructions which handle more than 128-bit vectors.
05632   if (!SVOp->getSimpleValueType(0).is128BitVector())
05633     return false;
05634 
05635   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
05636       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
05637     return true;
05638 
05639   return false;
05640 }
05641 
05642 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
05643 ///
05644 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
05645                                        unsigned NumNonZero, unsigned NumZero,
05646                                        SelectionDAG &DAG,
05647                                        const X86Subtarget* Subtarget,
05648                                        const TargetLowering &TLI) {
05649   if (NumNonZero > 8)
05650     return SDValue();
05651 
05652   SDLoc dl(Op);
05653   SDValue V;
05654   bool First = true;
05655   for (unsigned i = 0; i < 16; ++i) {
05656     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
05657     if (ThisIsNonZero && First) {
05658       if (NumZero)
05659         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05660       else
05661         V = DAG.getUNDEF(MVT::v8i16);
05662       First = false;
05663     }
05664 
05665     if ((i & 1) != 0) {
05666       SDValue ThisElt, LastElt;
05667       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
05668       if (LastIsNonZero) {
05669         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
05670                               MVT::i16, Op.getOperand(i-1));
05671       }
05672       if (ThisIsNonZero) {
05673         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
05674         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
05675                               ThisElt, DAG.getConstant(8, MVT::i8));
05676         if (LastIsNonZero)
05677           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
05678       } else
05679         ThisElt = LastElt;
05680 
05681       if (ThisElt.getNode())
05682         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
05683                         DAG.getIntPtrConstant(i/2));
05684     }
05685   }
05686 
05687   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
05688 }
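
// Sketch of what the loop above builds for one pair of adjacent v16i8
// operands when both are non-zero: the odd element is zero-extended to i16,
// shifted left by 8 and OR'd with the zero-extended even element,
//   ThisElt = (zext i16 Op[i]) << 8 | (zext i16 Op[i-1]),
// and the combined i16 is inserted at position i/2 of a v8i16 that is finally
// bitcast back to v16i8.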
05689 
05690 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
05691 ///
05692 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
05693                                      unsigned NumNonZero, unsigned NumZero,
05694                                      SelectionDAG &DAG,
05695                                      const X86Subtarget* Subtarget,
05696                                      const TargetLowering &TLI) {
05697   if (NumNonZero > 4)
05698     return SDValue();
05699 
05700   SDLoc dl(Op);
05701   SDValue V;
05702   bool First = true;
05703   for (unsigned i = 0; i < 8; ++i) {
05704     bool isNonZero = (NonZeros & (1 << i)) != 0;
05705     if (isNonZero) {
05706       if (First) {
05707         if (NumZero)
05708           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05709         else
05710           V = DAG.getUNDEF(MVT::v8i16);
05711         First = false;
05712       }
05713       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
05714                       MVT::v8i16, V, Op.getOperand(i),
05715                       DAG.getIntPtrConstant(i));
05716     }
05717   }
05718 
05719   return V;
05720 }
05721 
05722 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
05723 static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
05724                                      unsigned NonZeros, unsigned NumNonZero,
05725                                      unsigned NumZero, SelectionDAG &DAG,
05726                                      const X86Subtarget *Subtarget,
05727                                      const TargetLowering &TLI) {
05728   // We know there's at least one non-zero element
05729   unsigned FirstNonZeroIdx = 0;
05730   SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
05731   while (FirstNonZero.getOpcode() == ISD::UNDEF ||
05732          X86::isZeroNode(FirstNonZero)) {
05733     ++FirstNonZeroIdx;
05734     FirstNonZero = Op->getOperand(FirstNonZeroIdx);
05735   }
05736 
05737   if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05738       !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
05739     return SDValue();
05740 
05741   SDValue V = FirstNonZero.getOperand(0);
05742   MVT VVT = V.getSimpleValueType();
05743   if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
05744     return SDValue();
05745 
05746   unsigned FirstNonZeroDst =
05747       cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
05748   unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
05749   unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
05750   unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
05751 
05752   for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
05753     SDValue Elem = Op.getOperand(Idx);
05754     if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
05755       continue;
05756 
05757     // TODO: What else can be here? Deal with it.
05758     if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
05759       return SDValue();
05760 
05761     // TODO: Some optimizations are still possible here
05762     // ex: Getting one element from a vector, and the rest from another.
05763     if (Elem.getOperand(0) != V)
05764       return SDValue();
05765 
05766     unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
05767     if (Dst == Idx)
05768       ++CorrectIdx;
05769     else if (IncorrectIdx == -1U) {
05770       IncorrectIdx = Idx;
05771       IncorrectDst = Dst;
05772     } else
05773       // There was already one element with an incorrect index.
05774       // We can't optimize this case to an insertps.
05775       return SDValue();
05776   }
05777 
05778   if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
05779     SDLoc dl(Op);
05780     EVT VT = Op.getSimpleValueType();
05781     unsigned ElementMoveMask = 0;
05782     if (IncorrectIdx == -1U)
05783       ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
05784     else
05785       ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
05786 
05787     SDValue InsertpsMask =
05788         DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
05789     return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
05790   }
05791 
05792   return SDValue();
05793 }
05794 
05795 /// getVShift - Return a vector logical shift node.
05796 ///
05797 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
05798                          unsigned NumBits, SelectionDAG &DAG,
05799                          const TargetLowering &TLI, SDLoc dl) {
05800   assert(VT.is128BitVector() && "Unknown type for VShift");
05801   EVT ShVT = MVT::v2i64;
05802   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
05803   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
05804   return DAG.getNode(ISD::BITCAST, dl, VT,
05805                      DAG.getNode(Opc, dl, ShVT, SrcOp,
05806                              DAG.getConstant(NumBits,
05807                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
05808 }
05809 
05810 static SDValue
05811 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
05812 
05813   // Check if the scalar load can be widened into a vector load. And if
05814   // the address is "base + cst" see if the cst can be "absorbed" into
05815   // the shuffle mask.
05816   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
05817     SDValue Ptr = LD->getBasePtr();
05818     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
05819       return SDValue();
05820     EVT PVT = LD->getValueType(0);
05821     if (PVT != MVT::i32 && PVT != MVT::f32)
05822       return SDValue();
05823 
05824     int FI = -1;
05825     int64_t Offset = 0;
05826     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
05827       FI = FINode->getIndex();
05828       Offset = 0;
05829     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
05830                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
05831       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
05832       Offset = Ptr.getConstantOperandVal(1);
05833       Ptr = Ptr.getOperand(0);
05834     } else {
05835       return SDValue();
05836     }
05837 
05838     // FIXME: 256-bit vector instructions don't require a strict alignment,
05839     // improve this code to support it better.
05840     unsigned RequiredAlign = VT.getSizeInBits()/8;
05841     SDValue Chain = LD->getChain();
05842     // Make sure the stack object alignment is at least 16 or 32.
05843     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
05844     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
05845       if (MFI->isFixedObjectIndex(FI)) {
05846         // Can't change the alignment. FIXME: It's possible to compute
05847         // the exact stack offset and reference FI + adjust offset instead.
05848         // If someone *really* cares about this. That's the way to implement it.
05849         return SDValue();
05850       } else {
05851         MFI->setObjectAlignment(FI, RequiredAlign);
05852       }
05853     }
05854 
05855     // (Offset % 16 or 32) must be a multiple of 4. The address is then
05856     // Ptr + (Offset & ~(RequiredAlign - 1)).
05857     if (Offset < 0)
05858       return SDValue();
05859     if ((Offset % RequiredAlign) & 3)
05860       return SDValue();
05861     int64_t StartOffset = Offset & ~(RequiredAlign-1);
05862     if (StartOffset)
05863       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
05864                         Ptr, DAG.getConstant(StartOffset, Ptr.getValueType()));
05865 
05866     int EltNo = (Offset - StartOffset) >> 2;
05867     unsigned NumElems = VT.getVectorNumElements();
05868 
05869     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
05870     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
05871                              LD->getPointerInfo().getWithOffset(StartOffset),
05872                              false, false, false, 0);
05873 
05874     SmallVector<int, 8> Mask;
05875     for (unsigned i = 0; i != NumElems; ++i)
05876       Mask.push_back(EltNo);
05877 
05878     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
05879   }
05880 
05881   return SDValue();
05882 }
05883 
05884 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
05885 /// vector of type 'VT', see if the elements can be replaced by a single large
05886 /// load which has the same value as a build_vector whose operands are 'elts'.
05887 ///
05888 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
05889 ///
05890 /// FIXME: we'd also like to handle the case where the last elements are zero
05891 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
05892 /// There's even a handy isZeroNode for that purpose.
05893 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
05894                                         SDLoc &DL, SelectionDAG &DAG,
05895                                         bool isAfterLegalize) {
05896   EVT EltVT = VT.getVectorElementType();
05897   unsigned NumElems = Elts.size();
05898 
05899   LoadSDNode *LDBase = nullptr;
05900   unsigned LastLoadedElt = -1U;
05901 
05902   // For each element in the initializer, see if we've found a load or an undef.
05903   // If we don't find an initial load element, or later load elements are
05904   // non-consecutive, bail out.
05905   for (unsigned i = 0; i < NumElems; ++i) {
05906     SDValue Elt = Elts[i];
05907 
05908     if (!Elt.getNode() ||
05909         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
05910       return SDValue();
05911     if (!LDBase) {
05912       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
05913         return SDValue();
05914       LDBase = cast<LoadSDNode>(Elt.getNode());
05915       LastLoadedElt = i;
05916       continue;
05917     }
05918     if (Elt.getOpcode() == ISD::UNDEF)
05919       continue;
05920 
05921     LoadSDNode *LD = cast<LoadSDNode>(Elt);
05922     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
05923       return SDValue();
05924     LastLoadedElt = i;
05925   }
05926 
05927   // If we have found an entire vector of loads and undefs, then return a large
05928   // load of the entire vector width starting at the base pointer.  If we found
05929   // consecutive loads for the low half, generate a vzext_load node.
05930   if (LastLoadedElt == NumElems - 1) {
05931 
05932     if (isAfterLegalize &&
05933         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
05934       return SDValue();
05935 
05936     SDValue NewLd = SDValue();
05937     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
05938       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05939                           LDBase->getPointerInfo(),
05940                           LDBase->isVolatile(), LDBase->isNonTemporal(),
05941                           LDBase->isInvariant(), 0);
05942     else
05943       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05944                           LDBase->getPointerInfo(),
05945                           LDBase->isVolatile(), LDBase->isNonTemporal(),
05946                           LDBase->isInvariant(), LDBase->getAlignment());
05947 
05948     if (LDBase->hasAnyUseOfValue(1)) {
05949       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05950                                      SDValue(LDBase, 1),
05951                                      SDValue(NewLd.getNode(), 1));
05952       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05953       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05954                              SDValue(NewLd.getNode(), 1));
05955     }
05956 
05957     return NewLd;
05958   }
05959   if (NumElems == 4 && LastLoadedElt == 1 &&
05960       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
05961     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
05962     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
05963     SDValue ResNode =
05964         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
05965                                 LDBase->getPointerInfo(),
05966                                 LDBase->getAlignment(),
05967                                 false/*isVolatile*/, true/*ReadMem*/,
05968                                 false/*WriteMem*/);
05969 
05970     // Make sure the newly-created LOAD is in the same position as LDBase in
05971     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
05972     // update uses of LDBase's output chain to use the TokenFactor.
05973     if (LDBase->hasAnyUseOfValue(1)) {
05974       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05975                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
05976       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05977       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05978                              SDValue(ResNode.getNode(), 1));
05979     }
05980 
05981     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
05982   }
05983   return SDValue();
05984 }
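
// Two illustrative inputs for the logic above:
//   <(load i32* a), (load i32* a+4), (load i32* a+8), (load i32* a+12)>
//     -> a single wide v4i32 load of a (LastLoadedElt == NumElems - 1).
//   <(load i32* a), (load i32* a+4), undef, undef>
//     -> an X86ISD::VZEXT_LOAD of the low 64 bits (LastLoadedElt == 1),
//        bitcast to the requested vector type.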
05985 
05986 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
05987 /// to generate a splat value for the following cases:
05988 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
05989 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
05990 /// a scalar load, or a constant.
05991 /// The VBROADCAST node is returned when a pattern is found,
05992 /// or SDValue() otherwise.
05993 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
05994                                     SelectionDAG &DAG) {
05995   if (!Subtarget->hasFp256())
05996     return SDValue();
05997 
05998   MVT VT = Op.getSimpleValueType();
05999   SDLoc dl(Op);
06000 
06001   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
06002          "Unsupported vector type for broadcast.");
06003 
06004   SDValue Ld;
06005   bool ConstSplatVal;
06006 
06007   switch (Op.getOpcode()) {
06008     default:
06009       // Unknown pattern found.
06010       return SDValue();
06011 
06012     case ISD::BUILD_VECTOR: {
06013       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
06014       BitVector UndefElements;
06015       SDValue Splat = BVOp->getSplatValue(&UndefElements);
06016 
06017       // We need a splat of a single value to use broadcast, and it doesn't
06018       // make any sense if the value is only in one element of the vector.
06019       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
06020         return SDValue();
06021 
06022       Ld = Splat;
06023       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
06024                        Ld.getOpcode() == ISD::ConstantFP);
06025 
06026       // Make sure that all of the users of a non-constant load are from the
06027       // BUILD_VECTOR node.
06028       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
06029         return SDValue();
06030       break;
06031     }
06032 
06033     case ISD::VECTOR_SHUFFLE: {
06034       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
06035 
06036       // Shuffles must have a splat mask where the first element is
06037       // broadcasted.
06038       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
06039         return SDValue();
06040 
06041       SDValue Sc = Op.getOperand(0);
06042       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
06043           Sc.getOpcode() != ISD::BUILD_VECTOR) {
06044 
06045         if (!Subtarget->hasInt256())
06046           return SDValue();
06047 
06048         // Use the register form of the broadcast instruction available on AVX2.
06049         if (VT.getSizeInBits() >= 256)
06050           Sc = Extract128BitVector(Sc, 0, DAG, dl);
06051         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
06052       }
06053 
06054       Ld = Sc.getOperand(0);
06055       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
06056                        Ld.getOpcode() == ISD::ConstantFP);
06057 
06058       // The scalar_to_vector node and the suspected
06059       // load node must have exactly one user.
06060       // Constants may have multiple users.
06061 
06062       // AVX-512 has a register version of the broadcast.
06063       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
06064         Ld.getValueType().getSizeInBits() >= 32;
06065       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
06066           !hasRegVer))
06067         return SDValue();
06068       break;
06069     }
06070   }
06071 
06072   bool IsGE256 = (VT.getSizeInBits() >= 256);
06073 
06074   // Handle broadcasting a single constant scalar from the constant pool
06075   // into a vector. On Sandybridge it is still better to load a constant vector
06076   // from the constant pool and not to broadcast it from a scalar.
06077   if (ConstSplatVal && Subtarget->hasInt256()) {
06078     EVT CVT = Ld.getValueType();
06079     assert(!CVT.isVector() && "Must not broadcast a vector type");
06080     unsigned ScalarSize = CVT.getSizeInBits();
06081 
06082     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
06083       const Constant *C = nullptr;
06084       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
06085         C = CI->getConstantIntValue();
06086       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
06087         C = CF->getConstantFPValue();
06088 
06089       assert(C && "Invalid constant type");
06090 
06091       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06092       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
06093       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
06094       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
06095                        MachinePointerInfo::getConstantPool(),
06096                        false, false, false, Alignment);
06097 
06098       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06099     }
06100   }
06101 
06102   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
06103   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
06104 
06105   // Handle AVX2 in-register broadcasts.
06106   if (!IsLoad && Subtarget->hasInt256() &&
06107       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
06108     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06109 
06110   // The scalar source must be a normal load.
06111   if (!IsLoad)
06112     return SDValue();
06113 
06114   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
06115     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06116 
06117   // The integer check is needed for the 64-bit into 128-bit case so it doesn't
06118   // match double, since there is no vbroadcastsd xmm.
06119   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
06120     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
06121       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06122   }
06123 
06124   // Unsupported broadcast.
06125   return SDValue();
06126 }
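
// Typical pattern matched above (illustrative): a v8f32 BUILD_VECTOR whose
// operands are all the same f32 load becomes a single X86ISD::VBROADCAST of
// that load (ScalarSize == 32, IsLoad == true), which is normally selected to
// vbroadcastss. A splat of a 32-bit constant on an AVX2 target instead takes
// the constant-pool path and broadcasts the constant-pool load.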
06127 
06128 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
06129 /// underlying vector and index.
06130 ///
06131 /// Modifies \p ExtractedFromVec to the real vector and returns the real
06132 /// index.
06133 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
06134                                          SDValue ExtIdx) {
06135   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
06136   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
06137     return Idx;
06138 
06139   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
06140   // lowered this:
06141   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
06142   // to:
06143   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
06144   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
06145   //                           undef)
06146   //                       Constant<0>)
06147   // In this case the vector is the extract_subvector expression and the index
06148   // is 2, as specified by the shuffle.
06149   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
06150   SDValue ShuffleVec = SVOp->getOperand(0);
06151   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
06152   assert(ShuffleVecVT.getVectorElementType() ==
06153          ExtractedFromVec.getSimpleValueType().getVectorElementType());
06154 
06155   int ShuffleIdx = SVOp->getMaskElt(Idx);
06156   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
06157     ExtractedFromVec = ShuffleVec;
06158     return ShuffleIdx;
06159   }
06160   return Idx;
06161 }
06162 
06163 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
06164   MVT VT = Op.getSimpleValueType();
06165 
06166   // Skip if insert_vec_elt is not supported.
06167   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06168   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
06169     return SDValue();
06170 
06171   SDLoc DL(Op);
06172   unsigned NumElems = Op.getNumOperands();
06173 
06174   SDValue VecIn1;
06175   SDValue VecIn2;
06176   SmallVector<unsigned, 4> InsertIndices;
06177   SmallVector<int, 8> Mask(NumElems, -1);
06178 
06179   for (unsigned i = 0; i != NumElems; ++i) {
06180     unsigned Opc = Op.getOperand(i).getOpcode();
06181 
06182     if (Opc == ISD::UNDEF)
06183       continue;
06184 
06185     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
06186       // Quit if more than 1 element needs inserting.
06187       if (InsertIndices.size() > 1)
06188         return SDValue();
06189 
06190       InsertIndices.push_back(i);
06191       continue;
06192     }
06193 
06194     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
06195     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
06196     // Quit if non-constant index.
06197     if (!isa<ConstantSDNode>(ExtIdx))
06198       return SDValue();
06199     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
06200 
06201     // Quit if extracted from vector of different type.
06202     if (ExtractedFromVec.getValueType() != VT)
06203       return SDValue();
06204 
06205     if (!VecIn1.getNode())
06206       VecIn1 = ExtractedFromVec;
06207     else if (VecIn1 != ExtractedFromVec) {
06208       if (!VecIn2.getNode())
06209         VecIn2 = ExtractedFromVec;
06210       else if (VecIn2 != ExtractedFromVec)
06211         // Quit if more than 2 vectors to shuffle
06212         return SDValue();
06213     }
06214 
06215     if (ExtractedFromVec == VecIn1)
06216       Mask[i] = Idx;
06217     else if (ExtractedFromVec == VecIn2)
06218       Mask[i] = Idx + NumElems;
06219   }
06220 
06221   if (!VecIn1.getNode())
06222     return SDValue();
06223 
06224   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
06225   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
06226   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
06227     unsigned Idx = InsertIndices[i];
06228     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
06229                      DAG.getIntPtrConstant(Idx));
06230   }
06231 
06232   return NV;
06233 }
06234 
06235 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
06236 SDValue
06237 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
06238 
06239   MVT VT = Op.getSimpleValueType();
06240   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
06241          "Unexpected type in LowerBUILD_VECTORvXi1!");
06242 
06243   SDLoc dl(Op);
06244   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06245     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
06246     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06247     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06248   }
06249 
06250   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
06251     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
06252     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06253     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06254   }
06255 
06256   bool AllConstants = true;
06257   uint64_t Immediate = 0;
06258   int NonConstIdx = -1;
06259   bool IsSplat = true;
06260   unsigned NumNonConsts = 0;
06261   unsigned NumConsts = 0;
06262   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
06263     SDValue In = Op.getOperand(idx);
06264     if (In.getOpcode() == ISD::UNDEF)
06265       continue;
06266     if (!isa<ConstantSDNode>(In)) {
06267       AllConstants = false;
06268       NonConstIdx = idx;
06269       NumNonConsts++;
06270     }
06271     else {
06272       NumConsts++;
06273       if (cast<ConstantSDNode>(In)->getZExtValue())
06274         Immediate |= (1ULL << idx);
06275     }
06276     if (In != Op.getOperand(0))
06277       IsSplat = false;
06278   }
06279 
06280   if (AllConstants) {
06281     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
06282       DAG.getConstant(Immediate, MVT::i16));
06283     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
06284                        DAG.getIntPtrConstant(0));
06285   }
06286 
06287   if (NumNonConsts == 1 && NonConstIdx != 0) {
06288     SDValue DstVec;
06289     if (NumConsts) {
06290       SDValue VecAsImm = DAG.getConstant(Immediate,
06291                                          MVT::getIntegerVT(VT.getSizeInBits()));
06292       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
06293     }
06294     else 
06295       DstVec = DAG.getUNDEF(VT);
06296     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
06297                        Op.getOperand(NonConstIdx),
06298                        DAG.getIntPtrConstant(NonConstIdx));
06299   }
06300   if (!IsSplat && (NonConstIdx != 0))
06301     llvm_unreachable("Unsupported BUILD_VECTOR operation");
06302   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
06303   SDValue Select;
06304   if (IsSplat)
06305     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06306                           DAG.getConstant(-1, SelectVT),
06307                           DAG.getConstant(0, SelectVT));
06308   else
06309     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06310                          DAG.getConstant((Immediate | 1), SelectVT),
06311                          DAG.getConstant(Immediate, SelectVT));
06312   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
06313 }
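// Illustrative sketch (operands are hypothetical): for a v8i1 node
//   (build_vector 1, 0, 1, 1, 0, 0, 0, %c)
// the constant elements are collected into the i8 immediate 0b00001101 (13),
// which is bitcast to v8i1, and the single non-constant %c is then added with
//   (insert_vector_elt (bitcast (i8 13)), %c, 7).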
06314 
06315 /// \brief Return true if \p N implements a horizontal binop and return the
06316 /// operands for the horizontal binop into V0 and V1.
06317 /// 
06318 /// This is a helper function of PerformBUILD_VECTORCombine.
06319 /// This function checks that the build_vector \p N in input implements a
06320 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
06321 /// operation to match.
06322 /// For example, if \p Opcode is equal to ISD::ADD, then this function
06323 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
06324 /// is equal to ISD::SUB, then this function checks if this is a horizontal
06325 /// arithmetic sub.
06326 ///
06327 /// This function only analyzes elements of \p N whose indices are
06328 /// in range [BaseIdx, LastIdx).
06329 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
06330                               SelectionDAG &DAG,
06331                               unsigned BaseIdx, unsigned LastIdx,
06332                               SDValue &V0, SDValue &V1) {
06333   EVT VT = N->getValueType(0);
06334 
06335   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
06336   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
06337          "Invalid Vector in input!");
06338   
06339   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
06340   bool CanFold = true;
06341   unsigned ExpectedVExtractIdx = BaseIdx;
06342   unsigned NumElts = LastIdx - BaseIdx;
06343   V0 = DAG.getUNDEF(VT);
06344   V1 = DAG.getUNDEF(VT);
06345 
06346   // Check if N implements a horizontal binop.
06347   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
06348     SDValue Op = N->getOperand(i + BaseIdx);
06349 
06350     // Skip UNDEFs.
06351     if (Op->getOpcode() == ISD::UNDEF) {
06352       // Update the expected vector extract index.
06353       if (i * 2 == NumElts)
06354         ExpectedVExtractIdx = BaseIdx;
06355       ExpectedVExtractIdx += 2;
06356       continue;
06357     }
06358 
06359     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
06360 
06361     if (!CanFold)
06362       break;
06363 
06364     SDValue Op0 = Op.getOperand(0);
06365     SDValue Op1 = Op.getOperand(1);
06366 
06367     // Try to match the following pattern:
06368     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
06369     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06370         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06371         Op0.getOperand(0) == Op1.getOperand(0) &&
06372         isa<ConstantSDNode>(Op0.getOperand(1)) &&
06373         isa<ConstantSDNode>(Op1.getOperand(1)));
06374     if (!CanFold)
06375       break;
06376 
06377     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06378     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
06379 
06380     if (i * 2 < NumElts) {
06381       if (V0.getOpcode() == ISD::UNDEF)
06382         V0 = Op0.getOperand(0);
06383     } else {
06384       if (V1.getOpcode() == ISD::UNDEF)
06385         V1 = Op0.getOperand(0);
06386       if (i * 2 == NumElts)
06387         ExpectedVExtractIdx = BaseIdx;
06388     }
06389 
06390     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
06391     if (I0 == ExpectedVExtractIdx)
06392       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
06393     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
06394       // Try to match the following dag sequence:
06395       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
06396       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
06397     } else
06398       CanFold = false;
06399 
06400     ExpectedVExtractIdx += 2;
06401   }
06402 
06403   return CanFold;
06404 }
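// Illustrative sketch (operand names are hypothetical): a v4f32 node
//   (build_vector (fadd (extract_elt %A, 0), (extract_elt %A, 1)),
//                 (fadd (extract_elt %A, 2), (extract_elt %A, 3)),
//                 (fadd (extract_elt %B, 0), (extract_elt %B, 1)),
//                 (fadd (extract_elt %B, 2), (extract_elt %B, 3)))
// matches with Opcode == ISD::FADD over [0, 4), setting V0 = %A and V1 = %B;
// the combine below can then emit a single X86ISD::FHADD node.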
06405 
06406 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
06407 /// a concat_vector. 
06408 ///
06409 /// This is a helper function of PerformBUILD_VECTORCombine.
06410 /// This function expects two 256-bit vectors called V0 and V1.
06411 /// At first, each vector is split into two separate 128-bit vectors.
06412 /// Then, the resulting 128-bit vectors are used to implement two
06413 /// horizontal binary operations. 
06414 ///
06415 /// The kind of horizontal binary operation is defined by \p X86Opcode.
06416 ///
06417 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
06418 /// the two new horizontal binops.
06419 /// When Mode is set, the first horizontal binop dag node would take as input
06420 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
06421 /// horizontal binop dag node would take as input the lower 128-bit of V1
06422 /// and the upper 128-bit of V1.
06423 ///   Example:
06424 ///     HADD V0_LO, V0_HI
06425 ///     HADD V1_LO, V1_HI
06426 ///
06427 /// Otherwise, the first horizontal binop dag node takes as input the lower
06428 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
06429 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
06430 ///   Example:
06431 ///     HADD V0_LO, V1_LO
06432 ///     HADD V0_HI, V1_HI
06433 ///
06434 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
06435 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
06436 /// the upper 128-bits of the result.
06437 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
06438                                      SDLoc DL, SelectionDAG &DAG,
06439                                      unsigned X86Opcode, bool Mode,
06440                                      bool isUndefLO, bool isUndefHI) {
06441   EVT VT = V0.getValueType();
06442   assert(VT.is256BitVector() && VT == V1.getValueType() &&
06443          "Invalid nodes in input!");
06444 
06445   unsigned NumElts = VT.getVectorNumElements();
06446   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
06447   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
06448   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
06449   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
06450   EVT NewVT = V0_LO.getValueType();
06451 
06452   SDValue LO = DAG.getUNDEF(NewVT);
06453   SDValue HI = DAG.getUNDEF(NewVT);
06454 
06455   if (Mode) {
06456     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06457     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
06458       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
06459     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
06460       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
06461   } else {
06462     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06463     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
06464                        V1_LO->getOpcode() != ISD::UNDEF))
06465       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
06466 
06467     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
06468                        V1_HI->getOpcode() != ISD::UNDEF))
06469       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
06470   }
06471 
06472   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
06473 }
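// Illustrative sketch: with X86Opcode == X86ISD::HADD, Mode == false and no
// undef halves, two v8i32 inputs V0 and V1 are expanded above into
//   (concat_vectors (HADD V0_LO, V1_LO), (HADD V0_HI, V1_HI))
// where each half is a v4i32 obtained via Extract128BitVector.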
06474 
06475 /// \brief Try to fold a build_vector that performs an 'addsub' into the
06476 /// sequence of 'vadd + vsub + blendi'.
06477 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
06478                            const X86Subtarget *Subtarget) {
06479   SDLoc DL(BV);
06480   EVT VT = BV->getValueType(0);
06481   unsigned NumElts = VT.getVectorNumElements();
06482   SDValue InVec0 = DAG.getUNDEF(VT);
06483   SDValue InVec1 = DAG.getUNDEF(VT);
06484 
06485   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
06486           VT == MVT::v2f64) && "build_vector with an invalid type found!");
06487 
06488   // Odd-numbered elements in the input build vector are obtained from
06489   // adding two float elements.
06490   // Even-numbered elements in the input build vector are obtained from
06491   // subtracting two float elements.
06492   unsigned ExpectedOpcode = ISD::FSUB;
06493   unsigned NextExpectedOpcode = ISD::FADD;
06494   bool AddFound = false;
06495   bool SubFound = false;
06496 
06497   for (unsigned i = 0, e = NumElts; i != e; i++) {
06498     SDValue Op = BV->getOperand(i);
06499 
06500     // Skip 'undef' values.
06501     unsigned Opcode = Op.getOpcode();
06502     if (Opcode == ISD::UNDEF) {
06503       std::swap(ExpectedOpcode, NextExpectedOpcode);
06504       continue;
06505     }
06506 
06507     // Early exit if we found an unexpected opcode.
06508     if (Opcode != ExpectedOpcode)
06509       return SDValue();
06510 
06511     SDValue Op0 = Op.getOperand(0);
06512     SDValue Op1 = Op.getOperand(1);
06513 
06514     // Try to match the following pattern:
06515     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
06516     // Early exit if we cannot match that sequence.
06517     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06518         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06519         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
06520         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
06521         Op0.getOperand(1) != Op1.getOperand(1))
06522       return SDValue();
06523 
06524     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06525     if (I0 != i)
06526       return SDValue();
06527 
06528     // We found a valid add/sub node. Update the information accordingly.
06529     if (i & 1)
06530       AddFound = true;
06531     else
06532       SubFound = true;
06533 
06534     // Update InVec0 and InVec1.
06535     if (InVec0.getOpcode() == ISD::UNDEF)
06536       InVec0 = Op0.getOperand(0);
06537     if (InVec1.getOpcode() == ISD::UNDEF)
06538       InVec1 = Op1.getOperand(0);
06539 
06540     // Make sure that the operands of each add/sub node always
06541     // come from the same pair of vectors.
06542     if (InVec0 != Op0.getOperand(0)) {
06543       if (ExpectedOpcode == ISD::FSUB)
06544         return SDValue();
06545 
06546       // FADD is commutable. Try to commute the operands
06547       // and then test again.
06548       std::swap(Op0, Op1);
06549       if (InVec0 != Op0.getOperand(0))
06550         return SDValue();
06551     }
06552 
06553     if (InVec1 != Op1.getOperand(0))
06554       return SDValue();
06555 
06556     // Update the pair of expected opcodes.
06557     std::swap(ExpectedOpcode, NextExpectedOpcode);
06558   }
06559 
06560   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
06561   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
06562       InVec1.getOpcode() != ISD::UNDEF)
06563     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
06564 
06565   return SDValue();
06566 }
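// Illustrative sketch (operand names are hypothetical): a v4f32 node
//   (build_vector (fsub (extract_elt %A, 0), (extract_elt %B, 0)),
//                 (fadd (extract_elt %A, 1), (extract_elt %B, 1)),
//                 (fsub (extract_elt %A, 2), (extract_elt %B, 2)),
//                 (fadd (extract_elt %A, 3), (extract_elt %B, 3)))
// alternates FSUB/FADD on matching lanes of %A and %B, so matchAddSub folds it
// into a single (X86ISD::ADDSUB %A, %B), typically selected as addsubps on
// SSE3 targets.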
06567 
06568 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
06569                                           const X86Subtarget *Subtarget) {
06570   SDLoc DL(N);
06571   EVT VT = N->getValueType(0);
06572   unsigned NumElts = VT.getVectorNumElements();
06573   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
06574   SDValue InVec0, InVec1;
06575 
06576   // Try to match an ADDSUB.
06577   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
06578       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
06579     SDValue Value = matchAddSub(BV, DAG, Subtarget);
06580     if (Value.getNode())
06581       return Value;
06582   }
06583 
06584   // Try to match horizontal ADD/SUB.
06585   unsigned NumUndefsLO = 0;
06586   unsigned NumUndefsHI = 0;
06587   unsigned Half = NumElts/2;
06588 
06589   // Count the number of UNDEF operands in the input build_vector.
06590   for (unsigned i = 0, e = Half; i != e; ++i)
06591     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06592       NumUndefsLO++;
06593 
06594   for (unsigned i = Half, e = NumElts; i != e; ++i)
06595     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06596       NumUndefsHI++;
06597 
06598   // Early exit if this is either a build_vector of all UNDEFs, or if all
06599   // the operands but one are UNDEF.
06600   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
06601     return SDValue();
06602 
06603   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
06604     // Try to match an SSE3 float HADD/HSUB.
06605     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06606       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06607     
06608     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06609       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06610   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
06611     // Try to match an SSSE3 integer HADD/HSUB.
06612     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06613       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
06614     
06615     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06616       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
06617   }
06618   
06619   if (!Subtarget->hasAVX())
06620     return SDValue();
06621 
06622   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
06623     // Try to match an AVX horizontal add/sub of packed single/double
06624     // precision floating point values from 256-bit vectors.
06625     SDValue InVec2, InVec3;
06626     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
06627         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
06628         ((InVec0.getOpcode() == ISD::UNDEF ||
06629           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06630         ((InVec1.getOpcode() == ISD::UNDEF ||
06631           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06632       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06633 
06634     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
06635         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
06636         ((InVec0.getOpcode() == ISD::UNDEF ||
06637           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06638         ((InVec1.getOpcode() == ISD::UNDEF ||
06639           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06640       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06641   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
06642     // Try to match an AVX2 horizontal add/sub of signed integers.
06643     SDValue InVec2, InVec3;
06644     unsigned X86Opcode;
06645     bool CanFold = true;
06646 
06647     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
06648         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
06649         ((InVec0.getOpcode() == ISD::UNDEF ||
06650           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06651         ((InVec1.getOpcode() == ISD::UNDEF ||
06652           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06653       X86Opcode = X86ISD::HADD;
06654     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
06655         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
06656         ((InVec0.getOpcode() == ISD::UNDEF ||
06657           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06658         ((InVec1.getOpcode() == ISD::UNDEF ||
06659           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06660       X86Opcode = X86ISD::HSUB;
06661     else
06662       CanFold = false;
06663 
06664     if (CanFold) {
06665       // Fold this build_vector into a single horizontal add/sub.
06666       // Do this only if the target has AVX2.
06667       if (Subtarget->hasAVX2())
06668         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
06669  
06670       // Do not try to expand this build_vector into a pair of horizontal
06671       // add/sub if we can emit a pair of scalar add/sub.
06672       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06673         return SDValue();
06674 
06675       // Convert this build_vector into a pair of horizontal binop followed by
06676       // a concat vector.
06677       bool isUndefLO = NumUndefsLO == Half;
06678       bool isUndefHI = NumUndefsHI == Half;
06679       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
06680                                    isUndefLO, isUndefHI);
06681     }
06682   }
06683 
06684   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
06685        VT == MVT::v16i16) && Subtarget->hasAVX()) {
06686     unsigned X86Opcode;
06687     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06688       X86Opcode = X86ISD::HADD;
06689     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06690       X86Opcode = X86ISD::HSUB;
06691     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06692       X86Opcode = X86ISD::FHADD;
06693     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06694       X86Opcode = X86ISD::FHSUB;
06695     else
06696       return SDValue();
06697 
06698     // Don't try to expand this build_vector into a pair of horizontal add/sub
06699     // if we can simply emit a pair of scalar add/sub.
06700     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06701       return SDValue();
06702 
06703     // Convert this build_vector into two horizontal add/sub followed by
06704     // a concat vector.
06705     bool isUndefLO = NumUndefsLO == Half;
06706     bool isUndefHI = NumUndefsHI == Half;
06707     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
06708                                  isUndefLO, isUndefHI);
06709   }
06710 
06711   return SDValue();
06712 }
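// Illustrative sketch (operand names are hypothetical): a v8i32 build_vector
// whose lower half computes horizontal adds of %A/%B elements and whose upper
// half does the same (with matching or undef inputs) becomes a single
// X86ISD::HADD when AVX2 is available; otherwise it is typically expanded
// above into two 128-bit HADDs joined by a concat_vectors.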
06713 
06714 SDValue
06715 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
06716   SDLoc dl(Op);
06717 
06718   MVT VT = Op.getSimpleValueType();
06719   MVT ExtVT = VT.getVectorElementType();
06720   unsigned NumElems = Op.getNumOperands();
06721 
06722   // Lower build_vectors of i1 predicate elements via LowerBUILD_VECTORvXi1.
06723   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
06724     return LowerBUILD_VECTORvXi1(Op, DAG);
06725 
06726   // Vectors containing all zeros can be matched by pxor and xorps later
06727   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06728     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
06729     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
06730     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
06731       return Op;
06732 
06733     return getZeroVector(VT, Subtarget, DAG, dl);
06734   }
06735 
06736   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
06737   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
06738   // vpcmpeqd on 256-bit vectors.
06739   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
06740     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
06741       return Op;
06742 
06743     if (!VT.is512BitVector())
06744       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
06745   }
06746 
06747   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
06748   if (Broadcast.getNode())
06749     return Broadcast;
06750 
06751   unsigned EVTBits = ExtVT.getSizeInBits();
06752 
06753   unsigned NumZero  = 0;
06754   unsigned NumNonZero = 0;
06755   unsigned NonZeros = 0;
06756   bool IsAllConstants = true;
06757   SmallSet<SDValue, 8> Values;
06758   for (unsigned i = 0; i < NumElems; ++i) {
06759     SDValue Elt = Op.getOperand(i);
06760     if (Elt.getOpcode() == ISD::UNDEF)
06761       continue;
06762     Values.insert(Elt);
06763     if (Elt.getOpcode() != ISD::Constant &&
06764         Elt.getOpcode() != ISD::ConstantFP)
06765       IsAllConstants = false;
06766     if (X86::isZeroNode(Elt))
06767       NumZero++;
06768     else {
06769       NonZeros |= (1 << i);
06770       NumNonZero++;
06771     }
06772   }
06773 
06774   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
06775   if (NumNonZero == 0)
06776     return DAG.getUNDEF(VT);
06777 
06778   // Special case for single non-zero, non-undef, element.
06779   if (NumNonZero == 1) {
06780     unsigned Idx = countTrailingZeros(NonZeros);
06781     SDValue Item = Op.getOperand(Idx);
06782 
06783     // If this is an insertion of an i64 value on x86-32, and if the top bits of
06784     // the value are obviously zero, truncate the value to i32 and do the
06785     // insertion that way.  Only do this if the value is non-constant or if the
06786     // value is a constant being inserted into element 0.  It is cheaper to do
06787     // a constant pool load than it is to do a movd + shuffle.
06788     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
06789         (!IsAllConstants || Idx == 0)) {
06790       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
06791         // Handle SSE only.
06792         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
06793         EVT VecVT = MVT::v4i32;
06794         unsigned VecElts = 4;
06795 
06796         // Truncate the value (which may itself be a constant) to i32, and
06797         // convert it to a vector with movd (S2V+shuffle to zero extend).
06798         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
06799         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
06800 
06801         // If using the new shuffle lowering, just directly insert this.
06802         if (ExperimentalVectorShuffleLowering)
06803           return DAG.getNode(
06804               ISD::BITCAST, dl, VT,
06805               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
06806 
06807         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06808 
06809         // Now we have our 32-bit value zero extended in the low element of
06810         // a vector.  If Idx != 0, swizzle it into place.
06811         if (Idx != 0) {
06812           SmallVector<int, 4> Mask;
06813           Mask.push_back(Idx);
06814           for (unsigned i = 1; i != VecElts; ++i)
06815             Mask.push_back(i);
06816           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
06817                                       &Mask[0]);
06818         }
06819         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06820       }
06821     }
06822 
06823     // If we have a constant or non-constant insertion into the low element of
06824     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
06825     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
06826     // depending on what the source datatype is.
06827     if (Idx == 0) {
06828       if (NumZero == 0)
06829         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06830 
06831       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
06832           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
06833         if (VT.is256BitVector() || VT.is512BitVector()) {
06834           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
06835           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
06836                              Item, DAG.getIntPtrConstant(0));
06837         }
06838         assert(VT.is128BitVector() && "Expected an SSE value type!");
06839         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06840         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
06841         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06842       }
06843 
06844       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
06845         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
06846         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
06847         if (VT.is256BitVector()) {
06848           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
06849           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
06850         } else {
06851           assert(VT.is128BitVector() && "Expected an SSE value type!");
06852           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06853         }
06854         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06855       }
06856     }
06857 
06858     // Is it a vector logical left shift?
06859     if (NumElems == 2 && Idx == 1 &&
06860         X86::isZeroNode(Op.getOperand(0)) &&
06861         !X86::isZeroNode(Op.getOperand(1))) {
06862       unsigned NumBits = VT.getSizeInBits();
06863       return getVShift(true, VT,
06864                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
06865                                    VT, Op.getOperand(1)),
06866                        NumBits/2, DAG, *this, dl);
06867     }
06868 
06869     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
06870       return SDValue();
06871 
06872     // Otherwise, if this is a vector with i32 or f32 elements, and the element
06873     // is a non-constant being inserted into an element other than the low one,
06874     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
06875     // movd/movss) to move this into the low element, then shuffle it into
06876     // place.
06877     if (EVTBits == 32) {
06878       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06879 
06880       // If using the new shuffle lowering, just directly insert this.
06881       if (ExperimentalVectorShuffleLowering)
06882         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
06883 
06884       // Turn it into a shuffle of zero and zero-extended scalar to vector.
06885       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
06886       SmallVector<int, 8> MaskVec;
06887       for (unsigned i = 0; i != NumElems; ++i)
06888         MaskVec.push_back(i == Idx ? 0 : 1);
06889       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
06890     }
06891   }
06892 
06893   // Splat is obviously ok. Let legalizer expand it to a shuffle.
06894   if (Values.size() == 1) {
06895     if (EVTBits == 32) {
06896       // Instead of a shuffle like this:
06897       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
06898       // Check if it's possible to issue this instead.
06899       // shuffle (vload ptr), undef, <1, 1, 1, 1>
06900       unsigned Idx = countTrailingZeros(NonZeros);
06901       SDValue Item = Op.getOperand(Idx);
06902       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
06903         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
06904     }
06905     return SDValue();
06906   }
06907 
06908   // A vector full of immediates; various special cases are already
06909   // handled, so this is best done with a single constant-pool load.
06910   if (IsAllConstants)
06911     return SDValue();
06912 
06913   // For AVX-length vectors, build the individual 128-bit pieces and use
06914   // shuffles to put them in place.
06915   if (VT.is256BitVector() || VT.is512BitVector()) {
06916     SmallVector<SDValue, 64> V;
06917     for (unsigned i = 0; i != NumElems; ++i)
06918       V.push_back(Op.getOperand(i));
06919 
06920     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
06921 
06922     // Build both the lower and upper subvector.
06923     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
06924                                 makeArrayRef(&V[0], NumElems/2));
06925     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
06926                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
06927 
06928     // Recreate the wider vector with the lower and upper part.
06929     if (VT.is256BitVector())
06930       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06931     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06932   }
06933 
06934   // Let legalizer expand 2-wide build_vectors.
06935   if (EVTBits == 64) {
06936     if (NumNonZero == 1) {
06937       // One half is zero or undef.
06938       unsigned Idx = countTrailingZeros(NonZeros);
06939       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
06940                                  Op.getOperand(Idx));
06941       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
06942     }
06943     return SDValue();
06944   }
06945 
06946   // If element VT is < 32 bits, convert it to inserts into a zero vector.
06947   if (EVTBits == 8 && NumElems == 16) {
06948     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
06949                                         Subtarget, *this);
06950     if (V.getNode()) return V;
06951   }
06952 
06953   if (EVTBits == 16 && NumElems == 8) {
06954     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
06955                                       Subtarget, *this);
06956     if (V.getNode()) return V;
06957   }
06958 
06959   // If element VT is 32 bits and there are 4 elements, try to generate an INSERTPS
06960   if (EVTBits == 32 && NumElems == 4) {
06961     SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
06962                                       NumZero, DAG, Subtarget, *this);
06963     if (V.getNode())
06964       return V;
06965   }
06966 
06967   // If element VT is 32 bits, turn it into a number of shuffles.
06968   SmallVector<SDValue, 8> V(NumElems);
06969   if (NumElems == 4 && NumZero > 0) {
06970     for (unsigned i = 0; i < 4; ++i) {
06971       bool isZero = !(NonZeros & (1 << i));
06972       if (isZero)
06973         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
06974       else
06975         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
06976     }
06977 
06978     for (unsigned i = 0; i < 2; ++i) {
06979       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
06980         default: break;
06981         case 0:
06982           V[i] = V[i*2];  // Must be a zero vector.
06983           break;
06984         case 1:
06985           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
06986           break;
06987         case 2:
06988           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
06989           break;
06990         case 3:
06991           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
06992           break;
06993       }
06994     }
06995 
06996     bool Reverse1 = (NonZeros & 0x3) == 2;
06997     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
06998     int MaskVec[] = {
06999       Reverse1 ? 1 : 0,
07000       Reverse1 ? 0 : 1,
07001       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
07002       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
07003     };
07004     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
07005   }
07006 
07007   if (Values.size() > 1 && VT.is128BitVector()) {
07008     // Check for a build vector of consecutive loads.
07009     for (unsigned i = 0; i < NumElems; ++i)
07010       V[i] = Op.getOperand(i);
07011 
07012     // Check for elements which are consecutive loads.
07013     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
07014     if (LD.getNode())
07015       return LD;
07016 
07017     // Check for a build vector from mostly shuffle plus few inserting.
07018     SDValue Sh = buildFromShuffleMostly(Op, DAG);
07019     if (Sh.getNode())
07020       return Sh;
07021 
07022     // For SSE 4.1, use insertps to put the high elements into the low element.
07023     if (getSubtarget()->hasSSE41()) {
07024       SDValue Result;
07025       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
07026         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
07027       else
07028         Result = DAG.getUNDEF(VT);
07029 
07030       for (unsigned i = 1; i < NumElems; ++i) {
07031         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
07032         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
07033                              Op.getOperand(i), DAG.getIntPtrConstant(i));
07034       }
07035       return Result;
07036     }
07037 
07038     // Otherwise, expand into a number of unpckl*; start by extending each of
07039     // our (non-undef) elements to the full vector width with the element in the
07040     // bottom slot of the vector (which generates no code for SSE).
07041     for (unsigned i = 0; i < NumElems; ++i) {
07042       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
07043         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
07044       else
07045         V[i] = DAG.getUNDEF(VT);
07046     }
07047 
07048     // Next, we iteratively mix elements, e.g. for v4f32:
07049     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
07050     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
07051     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
07052     unsigned EltStride = NumElems >> 1;
07053     while (EltStride != 0) {
07054       for (unsigned i = 0; i < EltStride; ++i) {
07055         // If V[i+EltStride] is undef and this is the first round of mixing,
07056         // then it is safe to just drop this shuffle: V[i] is already in the
07057         // right place, the one element (since it's the first round) being
07058         // inserted as undef can be dropped.  This isn't safe for successive
07059         // rounds because they will permute elements within both vectors.
07060         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
07061             EltStride == NumElems/2)
07062           continue;
07063 
07064         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
07065       }
07066       EltStride >>= 1;
07067     }
07068     return V[0];
07069   }
07070   return SDValue();
07071 }
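// Illustrative sketch (operands are hypothetical): a v8f32 build_vector with
// several distinct non-constant operands that is not caught by the earlier
// zero/ones/broadcast/single-element paths is split above into two v4f32
// build_vectors over operands 0..3 and 4..7, which are then recombined with
// Concat128BitVectors (typically selected as a vinsertf128).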
07072 
07073 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
07074 // to create 256-bit vectors from two other 128-bit ones.
07075 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
07076   SDLoc dl(Op);
07077   MVT ResVT = Op.getSimpleValueType();
07078 
07079   assert((ResVT.is256BitVector() ||
07080           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
07081 
07082   SDValue V1 = Op.getOperand(0);
07083   SDValue V2 = Op.getOperand(1);
07084   unsigned NumElems = ResVT.getVectorNumElements();
07085   if (ResVT.is256BitVector())
07086     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
07087 
07088   if (Op.getNumOperands() == 4) {
07089     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
07090                                 ResVT.getVectorNumElements()/2);
07091     SDValue V3 = Op.getOperand(2);
07092     SDValue V4 = Op.getOperand(3);
07093     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
07094       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
07095   }
07096   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
07097 }
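// Illustrative sketch: a v16i32 concat_vectors with four v4i32 operands
// V1..V4 is handled above by first concatenating V1/V2 and V3/V4 into two
// v8i32 halves and then concatenating those halves into the 512-bit result.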
07098 
07099 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
07100   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
07101   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
07102          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
07103           Op.getNumOperands() == 4)));
07104 
07105   // AVX can use the vinsertf128 instruction to create 256-bit vectors
07106   // from two other 128-bit ones.
07107 
07108   // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
07109   return LowerAVXCONCAT_VECTORS(Op, DAG);
07110 }
07111 
07112 
07113 //===----------------------------------------------------------------------===//
07114 // Vector shuffle lowering
07115 //
07116 // This is an experimental code path for lowering vector shuffles on x86. It is
07117 // designed to handle arbitrary vector shuffles and blends, gracefully
07118 // degrading performance as necessary. It works hard to recognize idiomatic
07119 // shuffles and lower them to optimal instruction patterns without leaving
07120 // a framework that allows reasonably efficient handling of all vector shuffle
07121 // patterns.
07122 //===----------------------------------------------------------------------===//
07123 
07124 /// \brief Tiny helper function to identify a no-op mask.
07125 ///
07126 /// This is a somewhat boring predicate function. It checks whether the mask
07127 /// array input, which is assumed to be a single-input shuffle mask of the kind
07128 /// used by the X86 shuffle instructions (not a fully general
07129 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
07130 /// in-place shuffle are no-ops.
07131 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
07132   for (int i = 0, Size = Mask.size(); i < Size; ++i)
07133     if (Mask[i] != -1 && Mask[i] != i)
07134       return false;
07135   return true;
07136 }
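// For example, the masks <0, 1, 2, 3> and <0, -1, 2, -1> are no-op masks,
// while <1, 0, 2, 3> is not, because element 0 would have to move.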
07137 
07138 /// \brief Helper function to classify a mask as a single-input mask.
07139 ///
07140 /// This isn't a generic single-input test because in the vector shuffle
07141 /// lowering we canonicalize single inputs to be the first input operand. This
07142 /// means we can more quickly test for a single input by only checking whether
07143 /// an input from the second operand exists. We also assume that the size of
07144 /// mask corresponds to the size of the input vectors which isn't true in the
07145 /// fully general case.
07146 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
07147   for (int M : Mask)
07148     if (M >= (int)Mask.size())
07149       return false;
07150   return true;
07151 }
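// For example, with a 4-element mask, <0, 2, 1, 3> references only the first
// input and is single-input, whereas <0, 5, 1, 4> references elements of the
// second input (indices >= 4) and is not.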
07152 
07153 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
07154 // 2013 will allow us to use it as a non-type template parameter.
07155 namespace {
07156 
07157 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
07158 ///
07159 /// See its documentation for details.