LLVM API Documentation

X86ISelLowering.cpp
Go to the documentation of this file.
00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86InstrBuilder.h"
00019 #include "X86MachineFunctionInfo.h"
00020 #include "X86TargetMachine.h"
00021 #include "X86TargetObjectFile.h"
00022 #include "llvm/ADT/SmallBitVector.h"
00023 #include "llvm/ADT/SmallSet.h"
00024 #include "llvm/ADT/Statistic.h"
00025 #include "llvm/ADT/StringExtras.h"
00026 #include "llvm/ADT/StringSwitch.h"
00027 #include "llvm/ADT/VariadicFunction.h"
00028 #include "llvm/CodeGen/IntrinsicLowering.h"
00029 #include "llvm/CodeGen/MachineFrameInfo.h"
00030 #include "llvm/CodeGen/MachineFunction.h"
00031 #include "llvm/CodeGen/MachineInstrBuilder.h"
00032 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00033 #include "llvm/CodeGen/MachineModuleInfo.h"
00034 #include "llvm/CodeGen/MachineRegisterInfo.h"
00035 #include "llvm/IR/CallSite.h"
00036 #include "llvm/IR/CallingConv.h"
00037 #include "llvm/IR/Constants.h"
00038 #include "llvm/IR/DerivedTypes.h"
00039 #include "llvm/IR/Function.h"
00040 #include "llvm/IR/GlobalAlias.h"
00041 #include "llvm/IR/GlobalVariable.h"
00042 #include "llvm/IR/Instructions.h"
00043 #include "llvm/IR/Intrinsics.h"
00044 #include "llvm/MC/MCAsmInfo.h"
00045 #include "llvm/MC/MCContext.h"
00046 #include "llvm/MC/MCExpr.h"
00047 #include "llvm/MC/MCSymbol.h"
00048 #include "llvm/Support/CommandLine.h"
00049 #include "llvm/Support/Debug.h"
00050 #include "llvm/Support/ErrorHandling.h"
00051 #include "llvm/Support/MathExtras.h"
00052 #include "llvm/Target/TargetOptions.h"
00053 #include "X86IntrinsicsInfo.h"
00054 #include <bitset>
00055 #include <numeric>
00056 #include <cctype>
00057 using namespace llvm;
00058 
00059 #define DEBUG_TYPE "x86-isel"
00060 
00061 STATISTIC(NumTailCalls, "Number of tail calls");
00062 
00063 static cl::opt<bool> ExperimentalVectorWideningLegalization(
00064     "x86-experimental-vector-widening-legalization", cl::init(false),
00065     cl::desc("Enable an experimental vector type legalization through widening "
00066              "rather than promotion."),
00067     cl::Hidden);
00068 
00069 static cl::opt<bool> ExperimentalVectorShuffleLowering(
00070     "x86-experimental-vector-shuffle-lowering", cl::init(true),
00071     cl::desc("Enable an experimental vector shuffle lowering code path."),
00072     cl::Hidden);
00073 
00074 // Forward declarations.
00075 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00076                        SDValue V2);
00077 
00078 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
00079                                 SelectionDAG &DAG, SDLoc dl,
00080                                 unsigned vectorWidth) {
00081   assert((vectorWidth == 128 || vectorWidth == 256) &&
00082          "Unsupported vector width");
00083   EVT VT = Vec.getValueType();
00084   EVT ElVT = VT.getVectorElementType();
00085   unsigned Factor = VT.getSizeInBits()/vectorWidth;
00086   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
00087                                   VT.getVectorNumElements()/Factor);
00088 
00089   // Extract from UNDEF is UNDEF.
00090   if (Vec.getOpcode() == ISD::UNDEF)
00091     return DAG.getUNDEF(ResultVT);
00092 
00093   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
00094   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
00095 
00096   // This is the index of the first element of the vectorWidth-bit chunk
00097   // we want.
00098   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
00099                                * ElemsPerChunk);
00100 
00101   // If the input is a buildvector just emit a smaller one.
00102   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
00103     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
00104                        makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
00105                                     ElemsPerChunk));
00106 
00107   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00108   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
00109                                VecIdx);
00110 
00111   return Result;
00112 
00113 }
00114 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
00115 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
00116 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
00117 /// instructions or a simple subregister reference. Idx is an index in the
00118 /// 128 bits we want.  It need not be aligned to a 128-bit bounday.  That makes
00119 /// lowering EXTRACT_VECTOR_ELT operations easier.
00120 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
00121                                    SelectionDAG &DAG, SDLoc dl) {
00122   assert((Vec.getValueType().is256BitVector() ||
00123           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
00124   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
00125 }
00126 
00127 /// Generate a DAG to grab 256-bits from a 512-bit vector.
00128 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
00129                                    SelectionDAG &DAG, SDLoc dl) {
00130   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
00131   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
00132 }
00133 
00134 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
00135                                unsigned IdxVal, SelectionDAG &DAG,
00136                                SDLoc dl, unsigned vectorWidth) {
00137   assert((vectorWidth == 128 || vectorWidth == 256) &&
00138          "Unsupported vector width");
00139   // Inserting UNDEF is Result
00140   if (Vec.getOpcode() == ISD::UNDEF)
00141     return Result;
00142   EVT VT = Vec.getValueType();
00143   EVT ElVT = VT.getVectorElementType();
00144   EVT ResultVT = Result.getValueType();
00145 
00146   // Insert the relevant vectorWidth bits.
00147   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
00148 
00149   // This is the index of the first element of the vectorWidth-bit chunk
00150   // we want.
00151   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
00152                                * ElemsPerChunk);
00153 
00154   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00155   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
00156                      VecIdx);
00157 }
00158 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
00159 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
00160 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
00161 /// simple superregister reference.  Idx is an index in the 128 bits
00162 /// we want.  It need not be aligned to a 128-bit bounday.  That makes
00163 /// lowering INSERT_VECTOR_ELT operations easier.
00164 static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
00165                                   unsigned IdxVal, SelectionDAG &DAG,
00166                                   SDLoc dl) {
00167   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
00168   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
00169 }
00170 
00171 static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
00172                                   unsigned IdxVal, SelectionDAG &DAG,
00173                                   SDLoc dl) {
00174   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
00175   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
00176 }
00177 
00178 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
00179 /// instructions. This is used because creating CONCAT_VECTOR nodes of
00180 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
00181 /// large BUILD_VECTORS.
00182 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
00183                                    unsigned NumElems, SelectionDAG &DAG,
00184                                    SDLoc dl) {
00185   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00186   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
00187 }
00188 
00189 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
00190                                    unsigned NumElems, SelectionDAG &DAG,
00191                                    SDLoc dl) {
00192   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00193   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
00194 }
00195 
00196 static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
00197   if (TT.isOSBinFormatMachO()) {
00198     if (TT.getArch() == Triple::x86_64)
00199       return new X86_64MachoTargetObjectFile();
00200     return new TargetLoweringObjectFileMachO();
00201   }
00202 
00203   if (TT.isOSLinux())
00204     return new X86LinuxTargetObjectFile();
00205   if (TT.isOSBinFormatELF())
00206     return new TargetLoweringObjectFileELF();
00207   if (TT.isKnownWindowsMSVCEnvironment())
00208     return new X86WindowsTargetObjectFile();
00209   if (TT.isOSBinFormatCOFF())
00210     return new TargetLoweringObjectFileCOFF();
00211   llvm_unreachable("unknown subtarget type");
00212 }
00213 
00214 // FIXME: This should stop caching the target machine as soon as
00215 // we can remove resetOperationActions et al.
00216 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
00217     : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
00218   Subtarget = &TM.getSubtarget<X86Subtarget>();
00219   X86ScalarSSEf64 = Subtarget->hasSSE2();
00220   X86ScalarSSEf32 = Subtarget->hasSSE1();
00221   TD = getDataLayout();
00222 
00223   resetOperationActions();
00224 }
00225 
00226 void X86TargetLowering::resetOperationActions() {
00227   const TargetMachine &TM = getTargetMachine();
00228   static bool FirstTimeThrough = true;
00229 
00230   // If none of the target options have changed, then we don't need to reset the
00231   // operation actions.
00232   if (!FirstTimeThrough && TO == TM.Options) return;
00233 
00234   if (!FirstTimeThrough) {
00235     // Reinitialize the actions.
00236     initActions();
00237     FirstTimeThrough = false;
00238   }
00239 
00240   TO = TM.Options;
00241 
00242   // Set up the TargetLowering object.
00243   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00244 
00245   // X86 is weird, it always uses i8 for shift amounts and setcc results.
00246   setBooleanContents(ZeroOrOneBooleanContent);
00247   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00248   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
00249 
00250   // For 64-bit since we have so many registers use the ILP scheduler, for
00251   // 32-bit code use the register pressure specific scheduling.
00252   // For Atom, always use ILP scheduling.
00253   if (Subtarget->isAtom())
00254     setSchedulingPreference(Sched::ILP);
00255   else if (Subtarget->is64Bit())
00256     setSchedulingPreference(Sched::ILP);
00257   else
00258     setSchedulingPreference(Sched::RegPressure);
00259   const X86RegisterInfo *RegInfo =
00260       TM.getSubtarget<X86Subtarget>().getRegisterInfo();
00261   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00262 
00263   // Bypass expensive divides on Atom when compiling with O2
00264   if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
00265     addBypassSlowDiv(32, 8);
00266     if (Subtarget->is64Bit())
00267       addBypassSlowDiv(64, 16);
00268   }
00269 
00270   if (Subtarget->isTargetKnownWindowsMSVC()) {
00271     // Setup Windows compiler runtime calls.
00272     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00273     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00274     setLibcallName(RTLIB::SREM_I64, "_allrem");
00275     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00276     setLibcallName(RTLIB::MUL_I64, "_allmul");
00277     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00278     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00279     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00280     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00281     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00282 
00283     // The _ftol2 runtime function has an unusual calling conv, which
00284     // is modeled by a special pseudo-instruction.
00285     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
00286     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
00287     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
00288     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
00289   }
00290 
00291   if (Subtarget->isTargetDarwin()) {
00292     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00293     setUseUnderscoreSetJmp(false);
00294     setUseUnderscoreLongJmp(false);
00295   } else if (Subtarget->isTargetWindowsGNU()) {
00296     // MS runtime is weird: it exports _setjmp, but longjmp!
00297     setUseUnderscoreSetJmp(true);
00298     setUseUnderscoreLongJmp(false);
00299   } else {
00300     setUseUnderscoreSetJmp(true);
00301     setUseUnderscoreLongJmp(true);
00302   }
00303 
00304   // Set up the register classes.
00305   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00306   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00307   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00308   if (Subtarget->is64Bit())
00309     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00310 
00311   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
00312 
00313   // We don't accept any truncstore of integer registers.
00314   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00315   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00316   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00317   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00318   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00319   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00320 
00321   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00322 
00323   // SETOEQ and SETUNE require checking two conditions.
00324   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00325   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00326   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00327   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00328   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00329   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00330 
00331   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00332   // operation.
00333   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00334   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00335   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00336 
00337   if (Subtarget->is64Bit()) {
00338     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00339     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00340   } else if (!TM.Options.UseSoftFloat) {
00341     // We have an algorithm for SSE2->double, and we turn this into a
00342     // 64-bit FILD followed by conditional FADD for other targets.
00343     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00344     // We have an algorithm for SSE2, and we turn this into a 64-bit
00345     // FILD for other targets.
00346     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00347   }
00348 
00349   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00350   // this operation.
00351   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00352   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00353 
00354   if (!TM.Options.UseSoftFloat) {
00355     // SSE has no i16 to fp conversion, only i32
00356     if (X86ScalarSSEf32) {
00357       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00358       // f32 and f64 cases are Legal, f80 case is not
00359       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00360     } else {
00361       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00362       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00363     }
00364   } else {
00365     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00366     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00367   }
00368 
00369   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00370   // are Legal, f80 is custom lowered.
00371   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00372   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00373 
00374   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
00375   // this operation.
00376   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00377   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00378 
00379   if (X86ScalarSSEf32) {
00380     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00381     // f32 and f64 cases are Legal, f80 case is not
00382     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00383   } else {
00384     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00385     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00386   }
00387 
00388   // Handle FP_TO_UINT by promoting the destination to a larger signed
00389   // conversion.
00390   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00391   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00392   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
00393 
00394   if (Subtarget->is64Bit()) {
00395     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00396     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00397   } else if (!TM.Options.UseSoftFloat) {
00398     // Since AVX is a superset of SSE3, only check for SSE here.
00399     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00400       // Expand FP_TO_UINT into a select.
00401       // FIXME: We would like to use a Custom expander here eventually to do
00402       // the optimal thing for SSE vs. the default expansion in the legalizer.
00403       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00404     else
00405       // With SSE3 we can use fisttpll to convert to a signed i64; without
00406       // SSE, we're stuck with a fistpll.
00407       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00408   }
00409 
00410   if (isTargetFTOL()) {
00411     // Use the _ftol2 runtime function, which has a pseudo-instruction
00412     // to handle its weird calling convention.
00413     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00414   }
00415 
00416   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00417   if (!X86ScalarSSEf64) {
00418     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00419     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00420     if (Subtarget->is64Bit()) {
00421       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00422       // Without SSE, i64->f64 goes through memory.
00423       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00424     }
00425   }
00426 
00427   // Scalar integer divide and remainder are lowered to use operations that
00428   // produce two results, to match the available instructions. This exposes
00429   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00430   // into a single instruction.
00431   //
00432   // Scalar integer multiply-high is also lowered to use two-result
00433   // operations, to match the available instructions. However, plain multiply
00434   // (low) operations are left as Legal, as there are single-result
00435   // instructions for this in x86. Using the two-result multiply instructions
00436   // when both high and low results are needed must be arranged by dagcombine.
00437   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00438     MVT VT = IntVTs[i];
00439     setOperationAction(ISD::MULHS, VT, Expand);
00440     setOperationAction(ISD::MULHU, VT, Expand);
00441     setOperationAction(ISD::SDIV, VT, Expand);
00442     setOperationAction(ISD::UDIV, VT, Expand);
00443     setOperationAction(ISD::SREM, VT, Expand);
00444     setOperationAction(ISD::UREM, VT, Expand);
00445 
00446     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
00447     setOperationAction(ISD::ADDC, VT, Custom);
00448     setOperationAction(ISD::ADDE, VT, Custom);
00449     setOperationAction(ISD::SUBC, VT, Custom);
00450     setOperationAction(ISD::SUBE, VT, Custom);
00451   }
00452 
00453   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00454   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00455   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00456   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00457   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00458   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00459   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00460   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00461   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00462   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
00463   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
00464   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
00465   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
00466   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
00467   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
00468   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
00469   if (Subtarget->is64Bit())
00470     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00471   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00472   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00473   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00474   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00475   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00476   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00477   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00478   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00479 
00480   // Promote the i8 variants and force them on up to i32 which has a shorter
00481   // encoding.
00482   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00483   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00484   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00485   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
00486   if (Subtarget->hasBMI()) {
00487     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00488     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00489     if (Subtarget->is64Bit())
00490       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00491   } else {
00492     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00493     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00494     if (Subtarget->is64Bit())
00495       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00496   }
00497 
00498   if (Subtarget->hasLZCNT()) {
00499     // When promoting the i8 variants, force them to i32 for a shorter
00500     // encoding.
00501     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00502     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00503     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00504     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00505     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00506     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00507     if (Subtarget->is64Bit())
00508       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00509   } else {
00510     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00511     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00512     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00513     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00514     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00515     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00516     if (Subtarget->is64Bit()) {
00517       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00518       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00519     }
00520   }
00521 
00522   // Special handling for half-precision floating point conversions.
00523   // If we don't have F16C support, then lower half float conversions
00524   // into library calls.
00525   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
00526     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
00527     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
00528   }
00529 
00530   // There's never any support for operations beyond MVT::f32.
00531   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
00532   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
00533   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
00534   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
00535 
00536   setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
00537   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00538   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00539   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
00540 
00541   if (Subtarget->hasPOPCNT()) {
00542     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00543   } else {
00544     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00545     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00546     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00547     if (Subtarget->is64Bit())
00548       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00549   }
00550 
00551   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00552 
00553   if (!Subtarget->hasMOVBE())
00554     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00555 
00556   // These should be promoted to a larger select which is supported.
00557   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00558   // X86 wants to expand cmov itself.
00559   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00560   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00561   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00562   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00563   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00564   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00565   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00566   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00567   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00568   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00569   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00570   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00571   if (Subtarget->is64Bit()) {
00572     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00573     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00574   }
00575   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00576   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
00577   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
00578   // support continuation, user-level threading, and etc.. As a result, no
00579   // other SjLj exception interfaces are implemented and please don't build
00580   // your own exception handling based on them.
00581   // LLVM/Clang supports zero-cost DWARF exception handling.
00582   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00583   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00584 
00585   // Darwin ABI issue.
00586   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00587   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00588   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00589   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00590   if (Subtarget->is64Bit())
00591     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00592   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00593   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00594   if (Subtarget->is64Bit()) {
00595     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00596     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00597     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00598     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00599     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00600   }
00601   // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
00602   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00603   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00604   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00605   if (Subtarget->is64Bit()) {
00606     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00607     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00608     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00609   }
00610 
00611   if (Subtarget->hasSSE1())
00612     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00613 
00614   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00615 
00616   // Expand certain atomics
00617   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00618     MVT VT = IntVTs[i];
00619     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
00620     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00621     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00622   }
00623 
00624   if (Subtarget->hasCmpxchg16b()) {
00625     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
00626   }
00627 
00628   // FIXME - use subtarget debug flags
00629   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
00630       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
00631     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00632   }
00633 
00634   if (Subtarget->is64Bit()) {
00635     setExceptionPointerRegister(X86::RAX);
00636     setExceptionSelectorRegister(X86::RDX);
00637   } else {
00638     setExceptionPointerRegister(X86::EAX);
00639     setExceptionSelectorRegister(X86::EDX);
00640   }
00641   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00642   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00643 
00644   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00645   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00646 
00647   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00648   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00649 
00650   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00651   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00652   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00653   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00654     // TargetInfo::X86_64ABIBuiltinVaList
00655     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00656     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00657   } else {
00658     // TargetInfo::CharPtrBuiltinVaList
00659     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00660     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00661   }
00662 
00663   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00664   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00665 
00666   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
00667 
00668   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00669     // f32 and f64 use SSE.
00670     // Set up the FP register classes.
00671     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00672     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00673 
00674     // Use ANDPD to simulate FABS.
00675     setOperationAction(ISD::FABS , MVT::f64, Custom);
00676     setOperationAction(ISD::FABS , MVT::f32, Custom);
00677 
00678     // Use XORP to simulate FNEG.
00679     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00680     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00681 
00682     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00683     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00684     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00685 
00686     // Lower this to FGETSIGNx86 plus an AND.
00687     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00688     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00689 
00690     // We don't support sin/cos/fmod
00691     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00692     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00693     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00694     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00695     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00696     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00697 
00698     // Expand FP immediates into loads from the stack, except for the special
00699     // cases we handle.
00700     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00701     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00702   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00703     // Use SSE for f32, x87 for f64.
00704     // Set up the FP register classes.
00705     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00706     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00707 
00708     // Use ANDPS to simulate FABS.
00709     setOperationAction(ISD::FABS , MVT::f32, Custom);
00710 
00711     // Use XORP to simulate FNEG.
00712     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00713 
00714     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00715 
00716     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00717     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00718     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00719 
00720     // We don't support sin/cos/fmod
00721     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00722     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00723     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00724 
00725     // Special cases we handle for FP constants.
00726     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00727     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00728     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00729     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00730     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00731 
00732     if (!TM.Options.UnsafeFPMath) {
00733       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00734       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00735       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00736     }
00737   } else if (!TM.Options.UseSoftFloat) {
00738     // f32 and f64 in x87.
00739     // Set up the FP register classes.
00740     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00741     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00742 
00743     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00744     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00745     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00746     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00747 
00748     if (!TM.Options.UnsafeFPMath) {
00749       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00750       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00751       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00752       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00753       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00754       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00755     }
00756     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00757     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00758     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00759     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00760     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00761     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00762     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00763     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00764   }
00765 
00766   // We don't support FMA.
00767   setOperationAction(ISD::FMA, MVT::f64, Expand);
00768   setOperationAction(ISD::FMA, MVT::f32, Expand);
00769 
00770   // Long double always uses X87.
00771   if (!TM.Options.UseSoftFloat) {
00772     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00773     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00774     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00775     {
00776       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00777       addLegalFPImmediate(TmpFlt);  // FLD0
00778       TmpFlt.changeSign();
00779       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00780 
00781       bool ignored;
00782       APFloat TmpFlt2(+1.0);
00783       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00784                       &ignored);
00785       addLegalFPImmediate(TmpFlt2);  // FLD1
00786       TmpFlt2.changeSign();
00787       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00788     }
00789 
00790     if (!TM.Options.UnsafeFPMath) {
00791       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00792       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00793       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00794     }
00795 
00796     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00797     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00798     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00799     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00800     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00801     setOperationAction(ISD::FMA, MVT::f80, Expand);
00802   }
00803 
00804   // Always use a library call for pow.
00805   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00806   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00807   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00808 
00809   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00810   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00811   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00812   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00813   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00814   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
00815   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
00816 
00817   // First set operation action for all vector types to either promote
00818   // (for widening) or expand (for scalarization). Then we will selectively
00819   // turn on ones that can be effectively codegen'd.
00820   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
00821            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
00822     MVT VT = (MVT::SimpleValueType)i;
00823     setOperationAction(ISD::ADD , VT, Expand);
00824     setOperationAction(ISD::SUB , VT, Expand);
00825     setOperationAction(ISD::FADD, VT, Expand);
00826     setOperationAction(ISD::FNEG, VT, Expand);
00827     setOperationAction(ISD::FSUB, VT, Expand);
00828     setOperationAction(ISD::MUL , VT, Expand);
00829     setOperationAction(ISD::FMUL, VT, Expand);
00830     setOperationAction(ISD::SDIV, VT, Expand);
00831     setOperationAction(ISD::UDIV, VT, Expand);
00832     setOperationAction(ISD::FDIV, VT, Expand);
00833     setOperationAction(ISD::SREM, VT, Expand);
00834     setOperationAction(ISD::UREM, VT, Expand);
00835     setOperationAction(ISD::LOAD, VT, Expand);
00836     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00837     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00838     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00839     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00840     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00841     setOperationAction(ISD::FABS, VT, Expand);
00842     setOperationAction(ISD::FSIN, VT, Expand);
00843     setOperationAction(ISD::FSINCOS, VT, Expand);
00844     setOperationAction(ISD::FCOS, VT, Expand);
00845     setOperationAction(ISD::FSINCOS, VT, Expand);
00846     setOperationAction(ISD::FREM, VT, Expand);
00847     setOperationAction(ISD::FMA,  VT, Expand);
00848     setOperationAction(ISD::FPOWI, VT, Expand);
00849     setOperationAction(ISD::FSQRT, VT, Expand);
00850     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00851     setOperationAction(ISD::FFLOOR, VT, Expand);
00852     setOperationAction(ISD::FCEIL, VT, Expand);
00853     setOperationAction(ISD::FTRUNC, VT, Expand);
00854     setOperationAction(ISD::FRINT, VT, Expand);
00855     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00856     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00857     setOperationAction(ISD::MULHS, VT, Expand);
00858     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00859     setOperationAction(ISD::MULHU, VT, Expand);
00860     setOperationAction(ISD::SDIVREM, VT, Expand);
00861     setOperationAction(ISD::UDIVREM, VT, Expand);
00862     setOperationAction(ISD::FPOW, VT, Expand);
00863     setOperationAction(ISD::CTPOP, VT, Expand);
00864     setOperationAction(ISD::CTTZ, VT, Expand);
00865     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00866     setOperationAction(ISD::CTLZ, VT, Expand);
00867     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00868     setOperationAction(ISD::SHL, VT, Expand);
00869     setOperationAction(ISD::SRA, VT, Expand);
00870     setOperationAction(ISD::SRL, VT, Expand);
00871     setOperationAction(ISD::ROTL, VT, Expand);
00872     setOperationAction(ISD::ROTR, VT, Expand);
00873     setOperationAction(ISD::BSWAP, VT, Expand);
00874     setOperationAction(ISD::SETCC, VT, Expand);
00875     setOperationAction(ISD::FLOG, VT, Expand);
00876     setOperationAction(ISD::FLOG2, VT, Expand);
00877     setOperationAction(ISD::FLOG10, VT, Expand);
00878     setOperationAction(ISD::FEXP, VT, Expand);
00879     setOperationAction(ISD::FEXP2, VT, Expand);
00880     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00881     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00882     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00883     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00884     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00885     setOperationAction(ISD::TRUNCATE, VT, Expand);
00886     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00887     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00888     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00889     setOperationAction(ISD::VSELECT, VT, Expand);
00890     setOperationAction(ISD::SELECT_CC, VT, Expand);
00891     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
00892              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
00893       setTruncStoreAction(VT,
00894                           (MVT::SimpleValueType)InnerVT, Expand);
00895     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
00896     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
00897 
00898     // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types,
00899     // we have to deal with them whether we ask for Expansion or not. Setting
00900     // Expand causes its own optimisation problems though, so leave them legal.
00901     if (VT.getVectorElementType() == MVT::i1)
00902       setLoadExtAction(ISD::EXTLOAD, VT, Expand);
00903   }
00904 
00905   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00906   // with -msoft-float, disable use of MMX as well.
00907   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00908     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00909     // No operations on x86mmx supported, everything uses intrinsics.
00910   }
00911 
00912   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00913   // into smaller operations.
00914   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
00915   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
00916   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
00917   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
00918   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
00919   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
00920   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
00921   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
00922   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
00923   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
00924   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
00925   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
00926   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
00927   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
00928   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
00929   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
00930   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
00931   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
00932   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
00933   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
00934   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00935   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
00936   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
00937   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
00938   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
00939   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
00940   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
00941   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
00942   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
00943 
00944   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00945     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00946 
00947     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00948     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00949     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00950     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00951     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00952     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00953     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00954     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00955     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00956     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00957     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00958     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00959   }
00960 
00961   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00962     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00963 
00964     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
00965     // registers cannot be used even for integer operations.
00966     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00967     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00968     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00969     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00970 
00971     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00972     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00973     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00974     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00975     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00976     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00977     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
00978     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
00979     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
00980     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
00981     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00982     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00983     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00984     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00985     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00986     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00987     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00988     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00989     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00990     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00991     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00992     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00993 
00994     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00995     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00996     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00997     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00998 
00999     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
01000     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
01001     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01002     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01003     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01004 
01005     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
01006     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01007       MVT VT = (MVT::SimpleValueType)i;
01008       // Do not attempt to custom lower non-power-of-2 vectors
01009       if (!isPowerOf2_32(VT.getVectorNumElements()))
01010         continue;
01011       // Do not attempt to custom lower non-128-bit vectors
01012       if (!VT.is128BitVector())
01013         continue;
01014       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01015       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01016       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01017     }
01018 
01019     // We support custom legalizing of sext and anyext loads for specific
01020     // memory vector types which we can load as a scalar (or sequence of
01021     // scalars) and extend in-register to a legal 128-bit vector type. For sext
01022     // loads these must work with a single scalar load.
01023     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
01024     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
01025     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
01026     setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
01027     setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
01028     setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
01029     setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
01030     setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
01031     setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);
01032 
01033     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
01034     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
01035     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
01036     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
01037     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
01038     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
01039 
01040     if (Subtarget->is64Bit()) {
01041       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01042       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01043     }
01044 
01045     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
01046     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01047       MVT VT = (MVT::SimpleValueType)i;
01048 
01049       // Do not attempt to promote non-128-bit vectors
01050       if (!VT.is128BitVector())
01051         continue;
01052 
01053       setOperationAction(ISD::AND,    VT, Promote);
01054       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
01055       setOperationAction(ISD::OR,     VT, Promote);
01056       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
01057       setOperationAction(ISD::XOR,    VT, Promote);
01058       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
01059       setOperationAction(ISD::LOAD,   VT, Promote);
01060       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
01061       setOperationAction(ISD::SELECT, VT, Promote);
01062       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
01063     }
01064 
01065     // Custom lower v2i64 and v2f64 selects.
01066     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
01067     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
01068     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
01069     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
01070 
01071     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
01072     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
01073 
01074     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
01075     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
01076     // As there is no 64-bit GPR available, we need build a special custom
01077     // sequence to convert from v2i32 to v2f32.
01078     if (!Subtarget->is64Bit())
01079       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
01080 
01081     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
01082     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
01083 
01084     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
01085 
01086     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
01087     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
01088     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
01089   }
01090 
01091   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
01092     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
01093     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
01094     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
01095     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
01096     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
01097     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
01098     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
01099     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
01100     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
01101     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
01102 
01103     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
01104     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
01105     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
01106     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
01107     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
01108     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
01109     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
01110     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
01111     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
01112     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
01113 
01114     // FIXME: Do we need to handle scalar-to-vector here?
01115     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
01116 
01117     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
01118     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
01119     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
01120     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
01121     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
01122     // There is no BLENDI for byte vectors. We don't need to custom lower
01123     // some vselects for now.
01124     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
01125 
01126     // SSE41 brings specific instructions for doing vector sign extend even in
01127     // cases where we don't have SRA.
01128     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
01129     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
01130     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);
01131 
01132     // i8 and i16 vectors are custom because the source register and source
01133     // source memory operand types are not the same width.  f32 vectors are
01134     // custom since the immediate controlling the insert encodes additional
01135     // information.
01136     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
01137     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01138     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01139     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01140 
01141     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
01142     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
01143     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
01144     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
01145 
01146     // FIXME: these should be Legal, but that's only for the case where
01147     // the index is constant.  For now custom expand to deal with that.
01148     if (Subtarget->is64Bit()) {
01149       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01150       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01151     }
01152   }
01153 
01154   if (Subtarget->hasSSE2()) {
01155     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
01156     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
01157 
01158     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01159     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01160 
01161     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01162     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01163 
01164     // In the customized shift lowering, the legal cases in AVX2 will be
01165     // recognized.
01166     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01167     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01168 
01169     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01170     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01171 
01172     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01173   }
01174 
01175   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01176     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01177     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01178     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01179     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01180     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01181     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01182 
01183     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01184     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01185     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01186 
01187     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01188     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01189     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01190     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01191     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01192     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01193     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01194     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01195     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01196     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01197     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01198     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01199 
01200     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01201     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01202     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01203     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01204     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01205     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01206     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01207     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01208     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01209     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01210     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01211     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01212 
01213     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01214     // even though v8i16 is a legal type.
01215     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01216     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01217     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01218 
01219     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01220     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01221     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01222 
01223     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01224     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01225 
01226     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
01227 
01228     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01229     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01230 
01231     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01232     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01233 
01234     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01235     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01236 
01237     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01238     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01239     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01240     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01241 
01242     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01243     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01244     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01245 
01246     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
01247     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
01248     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
01249     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
01250 
01251     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01252     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01253     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01254     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01255     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01256     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01257     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01258     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01259     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01260     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01261     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01262     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01263 
01264     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01265       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01266       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01267       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01268       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01269       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01270       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01271     }
01272 
01273     if (Subtarget->hasInt256()) {
01274       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01275       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01276       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01277       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01278 
01279       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01280       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01281       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01282       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01283 
01284       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01285       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01286       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01287       // Don't lower v32i8 because there is no 128-bit byte mul
01288 
01289       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
01290       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
01291       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
01292       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
01293 
01294       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
01295       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01296     } else {
01297       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01298       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01299       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01300       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01301 
01302       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01303       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01304       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01305       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01306 
01307       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01308       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01309       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01310       // Don't lower v32i8 because there is no 128-bit byte mul
01311     }
01312 
01313     // In the customized shift lowering, the legal cases in AVX2 will be
01314     // recognized.
01315     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01316     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01317 
01318     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01319     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01320 
01321     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01322 
01323     // Custom lower several nodes for 256-bit types.
01324     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01325              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01326       MVT VT = (MVT::SimpleValueType)i;
01327 
01328       // Extract subvector is special because the value type
01329       // (result) is 128-bit but the source is 256-bit wide.
01330       if (VT.is128BitVector())
01331         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01332 
01333       // Do not attempt to custom lower other non-256-bit vectors
01334       if (!VT.is256BitVector())
01335         continue;
01336 
01337       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01338       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01339       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01340       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01341       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01342       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01343       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01344     }
01345 
01346     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01347     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01348       MVT VT = (MVT::SimpleValueType)i;
01349 
01350       // Do not attempt to promote non-256-bit vectors
01351       if (!VT.is256BitVector())
01352         continue;
01353 
01354       setOperationAction(ISD::AND,    VT, Promote);
01355       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01356       setOperationAction(ISD::OR,     VT, Promote);
01357       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01358       setOperationAction(ISD::XOR,    VT, Promote);
01359       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01360       setOperationAction(ISD::LOAD,   VT, Promote);
01361       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01362       setOperationAction(ISD::SELECT, VT, Promote);
01363       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01364     }
01365   }
01366 
01367   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01368     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01369     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01370     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01371     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01372 
01373     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01374     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01375     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01376 
01377     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01378     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01379     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01380     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01381     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01382     setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
01383     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01384     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01385     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01386     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01387     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01388 
01389     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01390     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01391     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01392     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01393     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01394     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01395 
01396     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01397     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01398     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01399     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01400     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01401     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01402     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01403     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01404 
01405     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01406     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01407     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01408     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01409     if (Subtarget->is64Bit()) {
01410       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01411       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01412       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01413       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01414     }
01415     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01416     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01417     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01418     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01419     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01420     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01421     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01422     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01423     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01424     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01425 
01426     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01427     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01428     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01429     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01430     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01431     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01432     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01433     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01434     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01435     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01436     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01437     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01438     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01439 
01440     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01441     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01442     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01443     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01444     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
01445     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01446 
01447     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01448     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01449 
01450     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01451 
01452     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01453     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01454     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01455     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01456     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01457     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01458     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01459     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01460     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01461 
01462     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01463     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01464 
01465     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01466     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01467 
01468     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01469 
01470     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01471     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01472 
01473     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01474     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01475 
01476     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01477     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01478 
01479     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01480     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01481     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01482     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01483     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01484     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01485 
01486     if (Subtarget->hasCDI()) {
01487       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
01488       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
01489     }
01490 
01491     // Custom lower several nodes.
01492     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01493              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01494       MVT VT = (MVT::SimpleValueType)i;
01495 
01496       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01497       // Extract subvector is special because the value type
01498       // (result) is 256/128-bit but the source is 512-bit wide.
01499       if (VT.is128BitVector() || VT.is256BitVector())
01500         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01501 
01502       if (VT.getVectorElementType() == MVT::i1)
01503         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01504 
01505       // Do not attempt to custom lower other non-512-bit vectors
01506       if (!VT.is512BitVector())
01507         continue;
01508 
01509       if ( EltSize >= 32) {
01510         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01511         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01512         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01513         setOperationAction(ISD::VSELECT,             VT, Legal);
01514         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01515         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01516         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01517       }
01518     }
01519     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01520       MVT VT = (MVT::SimpleValueType)i;
01521 
01522       // Do not attempt to promote non-256-bit vectors
01523       if (!VT.is512BitVector())
01524         continue;
01525 
01526       setOperationAction(ISD::SELECT, VT, Promote);
01527       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01528     }
01529   }// has  AVX-512
01530 
01531   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
01532     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
01533     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
01534 
01535     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
01536     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
01537 
01538     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
01539     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
01540     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
01541     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
01542 
01543     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01544       const MVT VT = (MVT::SimpleValueType)i;
01545 
01546       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01547 
01548       // Do not attempt to promote non-256-bit vectors
01549       if (!VT.is512BitVector())
01550         continue;
01551 
01552       if ( EltSize < 32) {
01553         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01554         setOperationAction(ISD::VSELECT,             VT, Legal);
01555       }
01556     }
01557   }
01558 
01559   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
01560     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
01561     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
01562 
01563     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
01564     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
01565     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
01566   }
01567 
01568   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
01569   // of this type with custom code.
01570   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
01571            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
01572     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
01573                        Custom);
01574   }
01575 
01576   // We want to custom lower some of our intrinsics.
01577   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01578   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01579   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01580   if (!Subtarget->is64Bit())
01581     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01582 
01583   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01584   // handle type legalization for these operations here.
01585   //
01586   // FIXME: We really should do custom legalization for addition and
01587   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01588   // than generic legalization for 64-bit multiplication-with-overflow, though.
01589   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01590     // Add/Sub/Mul with overflow operations are custom lowered.
01591     MVT VT = IntVTs[i];
01592     setOperationAction(ISD::SADDO, VT, Custom);
01593     setOperationAction(ISD::UADDO, VT, Custom);
01594     setOperationAction(ISD::SSUBO, VT, Custom);
01595     setOperationAction(ISD::USUBO, VT, Custom);
01596     setOperationAction(ISD::SMULO, VT, Custom);
01597     setOperationAction(ISD::UMULO, VT, Custom);
01598   }
01599 
01600 
01601   if (!Subtarget->is64Bit()) {
01602     // These libcalls are not available in 32-bit.
01603     setLibcallName(RTLIB::SHL_I128, nullptr);
01604     setLibcallName(RTLIB::SRL_I128, nullptr);
01605     setLibcallName(RTLIB::SRA_I128, nullptr);
01606   }
01607 
01608   // Combine sin / cos into one node or libcall if possible.
01609   if (Subtarget->hasSinCos()) {
01610     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01611     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01612     if (Subtarget->isTargetDarwin()) {
01613       // For MacOSX, we don't want to the normal expansion of a libcall to
01614       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
01615       // traffic.
01616       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01617       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01618     }
01619   }
01620 
01621   if (Subtarget->isTargetWin64()) {
01622     setOperationAction(ISD::SDIV, MVT::i128, Custom);
01623     setOperationAction(ISD::UDIV, MVT::i128, Custom);
01624     setOperationAction(ISD::SREM, MVT::i128, Custom);
01625     setOperationAction(ISD::UREM, MVT::i128, Custom);
01626     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
01627     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
01628   }
01629 
01630   // We have target-specific dag combine patterns for the following nodes:
01631   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01632   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01633   setTargetDAGCombine(ISD::VSELECT);
01634   setTargetDAGCombine(ISD::SELECT);
01635   setTargetDAGCombine(ISD::SHL);
01636   setTargetDAGCombine(ISD::SRA);
01637   setTargetDAGCombine(ISD::SRL);
01638   setTargetDAGCombine(ISD::OR);
01639   setTargetDAGCombine(ISD::AND);
01640   setTargetDAGCombine(ISD::ADD);
01641   setTargetDAGCombine(ISD::FADD);
01642   setTargetDAGCombine(ISD::FSUB);
01643   setTargetDAGCombine(ISD::FMA);
01644   setTargetDAGCombine(ISD::SUB);
01645   setTargetDAGCombine(ISD::LOAD);
01646   setTargetDAGCombine(ISD::STORE);
01647   setTargetDAGCombine(ISD::ZERO_EXTEND);
01648   setTargetDAGCombine(ISD::ANY_EXTEND);
01649   setTargetDAGCombine(ISD::SIGN_EXTEND);
01650   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01651   setTargetDAGCombine(ISD::TRUNCATE);
01652   setTargetDAGCombine(ISD::SINT_TO_FP);
01653   setTargetDAGCombine(ISD::SETCC);
01654   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
01655   setTargetDAGCombine(ISD::BUILD_VECTOR);
01656   if (Subtarget->is64Bit())
01657     setTargetDAGCombine(ISD::MUL);
01658   setTargetDAGCombine(ISD::XOR);
01659 
01660   computeRegisterProperties();
01661 
01662   // On Darwin, -Os means optimize for size without hurting performance,
01663   // do not reduce the limit.
01664   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01665   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01666   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01667   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01668   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01669   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01670   setPrefLoopAlignment(4); // 2^4 bytes.
01671 
01672   // Predictable cmov don't hurt on atom because it's in-order.
01673   PredictableSelectIsExpensive = !Subtarget->isAtom();
01674 
01675   setPrefFunctionAlignment(4); // 2^4 bytes.
01676 
01677   verifyIntrinsicTables();
01678 }
01679 
01680 // This has so far only been implemented for 64-bit MachO.
01681 bool X86TargetLowering::useLoadStackGuardNode() const {
01682   return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO &&
01683          Subtarget->is64Bit();
01684 }
01685 
01686 TargetLoweringBase::LegalizeTypeAction
01687 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
01688   if (ExperimentalVectorWideningLegalization &&
01689       VT.getVectorNumElements() != 1 &&
01690       VT.getVectorElementType().getSimpleVT() != MVT::i1)
01691     return TypeWidenVector;
01692 
01693   return TargetLoweringBase::getPreferredVectorAction(VT);
01694 }
01695 
01696 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01697   if (!VT.isVector())
01698     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
01699 
01700   const unsigned NumElts = VT.getVectorNumElements();
01701   const EVT EltVT = VT.getVectorElementType();
01702   if (VT.is512BitVector()) {
01703     if (Subtarget->hasAVX512())
01704       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01705           EltVT == MVT::f32 || EltVT == MVT::f64)
01706         switch(NumElts) {
01707         case  8: return MVT::v8i1;
01708         case 16: return MVT::v16i1;
01709       }
01710     if (Subtarget->hasBWI())
01711       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01712         switch(NumElts) {
01713         case 32: return MVT::v32i1;
01714         case 64: return MVT::v64i1;
01715       }
01716   }
01717 
01718   if (VT.is256BitVector() || VT.is128BitVector()) {
01719     if (Subtarget->hasVLX())
01720       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01721           EltVT == MVT::f32 || EltVT == MVT::f64)
01722         switch(NumElts) {
01723         case 2: return MVT::v2i1;
01724         case 4: return MVT::v4i1;
01725         case 8: return MVT::v8i1;
01726       }
01727     if (Subtarget->hasBWI() && Subtarget->hasVLX())
01728       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01729         switch(NumElts) {
01730         case  8: return MVT::v8i1;
01731         case 16: return MVT::v16i1;
01732         case 32: return MVT::v32i1;
01733       }
01734   }
01735 
01736   return VT.changeVectorElementTypeToInteger();
01737 }
01738 
01739 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
01740 /// the desired ByVal argument alignment.
01741 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01742   if (MaxAlign == 16)
01743     return;
01744   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01745     if (VTy->getBitWidth() == 128)
01746       MaxAlign = 16;
01747   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01748     unsigned EltAlign = 0;
01749     getMaxByValAlign(ATy->getElementType(), EltAlign);
01750     if (EltAlign > MaxAlign)
01751       MaxAlign = EltAlign;
01752   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01753     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01754       unsigned EltAlign = 0;
01755       getMaxByValAlign(STy->getElementType(i), EltAlign);
01756       if (EltAlign > MaxAlign)
01757         MaxAlign = EltAlign;
01758       if (MaxAlign == 16)
01759         break;
01760     }
01761   }
01762 }
01763 
01764 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
01765 /// function arguments in the caller parameter area. For X86, aggregates
01766 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01767 /// are at 4-byte boundaries.
01768 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01769   if (Subtarget->is64Bit()) {
01770     // Max of 8 and alignment of type.
01771     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01772     if (TyAlign > 8)
01773       return TyAlign;
01774     return 8;
01775   }
01776 
01777   unsigned Align = 4;
01778   if (Subtarget->hasSSE1())
01779     getMaxByValAlign(Ty, Align);
01780   return Align;
01781 }
01782 
01783 /// getOptimalMemOpType - Returns the target specific optimal type for load
01784 /// and store operations as a result of memset, memcpy, and memmove
01785 /// lowering. If DstAlign is zero that means it's safe to destination
01786 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
01787 /// means there isn't a need to check it against alignment requirement,
01788 /// probably because the source does not need to be loaded. If 'IsMemset' is
01789 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
01790 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
01791 /// source is constant so it does not need to be loaded.
01792 /// It returns EVT::Other if the type should be determined using generic
01793 /// target-independent logic.
01794 EVT
01795 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01796                                        unsigned DstAlign, unsigned SrcAlign,
01797                                        bool IsMemset, bool ZeroMemset,
01798                                        bool MemcpyStrSrc,
01799                                        MachineFunction &MF) const {
01800   const Function *F = MF.getFunction();
01801   if ((!IsMemset || ZeroMemset) &&
01802       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
01803                                        Attribute::NoImplicitFloat)) {
01804     if (Size >= 16 &&
01805         (Subtarget->isUnalignedMemAccessFast() ||
01806          ((DstAlign == 0 || DstAlign >= 16) &&
01807           (SrcAlign == 0 || SrcAlign >= 16)))) {
01808       if (Size >= 32) {
01809         if (Subtarget->hasInt256())
01810           return MVT::v8i32;
01811         if (Subtarget->hasFp256())
01812           return MVT::v8f32;
01813       }
01814       if (Subtarget->hasSSE2())
01815         return MVT::v4i32;
01816       if (Subtarget->hasSSE1())
01817         return MVT::v4f32;
01818     } else if (!MemcpyStrSrc && Size >= 8 &&
01819                !Subtarget->is64Bit() &&
01820                Subtarget->hasSSE2()) {
01821       // Do not use f64 to lower memcpy if source is string constant. It's
01822       // better to use i32 to avoid the loads.
01823       return MVT::f64;
01824     }
01825   }
01826   if (Subtarget->is64Bit() && Size >= 8)
01827     return MVT::i64;
01828   return MVT::i32;
01829 }
01830 
01831 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01832   if (VT == MVT::f32)
01833     return X86ScalarSSEf32;
01834   else if (VT == MVT::f64)
01835     return X86ScalarSSEf64;
01836   return true;
01837 }
01838 
01839 bool
01840 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
01841                                                   unsigned,
01842                                                   unsigned,
01843                                                   bool *Fast) const {
01844   if (Fast)
01845     *Fast = Subtarget->isUnalignedMemAccessFast();
01846   return true;
01847 }
01848 
01849 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
01850 /// current function.  The returned value is a member of the
01851 /// MachineJumpTableInfo::JTEntryKind enum.
01852 unsigned X86TargetLowering::getJumpTableEncoding() const {
01853   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01854   // symbol.
01855   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01856       Subtarget->isPICStyleGOT())
01857     return MachineJumpTableInfo::EK_Custom32;
01858 
01859   // Otherwise, use the normal jump table encoding heuristics.
01860   return TargetLowering::getJumpTableEncoding();
01861 }
01862 
01863 const MCExpr *
01864 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01865                                              const MachineBasicBlock *MBB,
01866                                              unsigned uid,MCContext &Ctx) const{
01867   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
01868          Subtarget->isPICStyleGOT());
01869   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01870   // entries.
01871   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01872                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01873 }
01874 
01875 /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
01876 /// jumptable.
01877 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01878                                                     SelectionDAG &DAG) const {
01879   if (!Subtarget->is64Bit())
01880     // This doesn't have SDLoc associated with it, but is not really the
01881     // same as a Register.
01882     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01883   return Table;
01884 }
01885 
01886 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
01887 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
01888 /// MCExpr.
01889 const MCExpr *X86TargetLowering::
01890 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01891                              MCContext &Ctx) const {
01892   // X86-64 uses RIP relative addressing based on the jump table label.
01893   if (Subtarget->isPICStyleRIPRel())
01894     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01895 
01896   // Otherwise, the reference is relative to the PIC base.
01897   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01898 }
01899 
01900 // FIXME: Why this routine is here? Move to RegInfo!
01901 std::pair<const TargetRegisterClass*, uint8_t>
01902 X86TargetLowering::findRepresentativeClass(MVT VT) const{
01903   const TargetRegisterClass *RRC = nullptr;
01904   uint8_t Cost = 1;
01905   switch (VT.SimpleTy) {
01906   default:
01907     return TargetLowering::findRepresentativeClass(VT);
01908   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01909     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
01910     break;
01911   case MVT::x86mmx:
01912     RRC = &X86::VR64RegClass;
01913     break;
01914   case MVT::f32: case MVT::f64:
01915   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01916   case MVT::v4f32: case MVT::v2f64:
01917   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01918   case MVT::v4f64:
01919     RRC = &X86::VR128RegClass;
01920     break;
01921   }
01922   return std::make_pair(RRC, Cost);
01923 }
01924 
01925 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01926                                                unsigned &Offset) const {
01927   if (!Subtarget->isTargetLinux())
01928     return false;
01929 
01930   if (Subtarget->is64Bit()) {
01931     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
01932     Offset = 0x28;
01933     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01934       AddressSpace = 256;
01935     else
01936       AddressSpace = 257;
01937   } else {
01938     // %gs:0x14 on i386
01939     Offset = 0x14;
01940     AddressSpace = 256;
01941   }
01942   return true;
01943 }
01944 
01945 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01946                                             unsigned DestAS) const {
01947   assert(SrcAS != DestAS && "Expected different address spaces!");
01948 
01949   return SrcAS < 256 && DestAS < 256;
01950 }
01951 
01952 //===----------------------------------------------------------------------===//
01953 //               Return Value Calling Convention Implementation
01954 //===----------------------------------------------------------------------===//
01955 
01956 #include "X86GenCallingConv.inc"
01957 
01958 bool
01959 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01960                                   MachineFunction &MF, bool isVarArg,
01961                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01962                         LLVMContext &Context) const {
01963   SmallVector<CCValAssign, 16> RVLocs;
01964   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
01965   return CCInfo.CheckReturn(Outs, RetCC_X86);
01966 }
01967 
01968 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
01969   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
01970   return ScratchRegs;
01971 }
01972 
01973 SDValue
01974 X86TargetLowering::LowerReturn(SDValue Chain,
01975                                CallingConv::ID CallConv, bool isVarArg,
01976                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01977                                const SmallVectorImpl<SDValue> &OutVals,
01978                                SDLoc dl, SelectionDAG &DAG) const {
01979   MachineFunction &MF = DAG.getMachineFunction();
01980   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01981 
01982   SmallVector<CCValAssign, 16> RVLocs;
01983   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
01984   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01985 
01986   SDValue Flag;
01987   SmallVector<SDValue, 6> RetOps;
01988   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
01989   // Operand #1 = Bytes To Pop
01990   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
01991                    MVT::i16));
01992 
01993   // Copy the result values into the output registers.
01994   for (unsigned i = 0; i != RVLocs.size(); ++i) {
01995     CCValAssign &VA = RVLocs[i];
01996     assert(VA.isRegLoc() && "Can only return in registers!");
01997     SDValue ValToCopy = OutVals[i];
01998     EVT ValVT = ValToCopy.getValueType();
01999 
02000     // Promote values to the appropriate types
02001     if (VA.getLocInfo() == CCValAssign::SExt)
02002       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
02003     else if (VA.getLocInfo() == CCValAssign::ZExt)
02004       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
02005     else if (VA.getLocInfo() == CCValAssign::AExt)
02006       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
02007     else if (VA.getLocInfo() == CCValAssign::BCvt)
02008       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
02009 
02010     assert(VA.getLocInfo() != CCValAssign::FPExt &&
02011            "Unexpected FP-extend for return value.");  
02012 
02013     // If this is x86-64, and we disabled SSE, we can't return FP values,
02014     // or SSE or MMX vectors.
02015     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
02016          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
02017           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
02018       report_fatal_error("SSE register return with SSE disabled");
02019     }
02020     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
02021     // llvm-gcc has never done it right and no one has noticed, so this
02022     // should be OK for now.
02023     if (ValVT == MVT::f64 &&
02024         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
02025       report_fatal_error("SSE2 register return with SSE2 disabled");
02026 
02027     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
02028     // the RET instruction and handled by the FP Stackifier.
02029     if (VA.getLocReg() == X86::FP0 ||
02030         VA.getLocReg() == X86::FP1) {
02031       // If this is a copy from an xmm register to ST(0), use an FPExtend to
02032       // change the value to the FP stack register class.
02033       if (isScalarFPTypeInSSEReg(VA.getValVT()))
02034         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
02035       RetOps.push_back(ValToCopy);
02036       // Don't emit a copytoreg.
02037       continue;
02038     }
02039 
02040     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
02041     // which is returned in RAX / RDX.
02042     if (Subtarget->is64Bit()) {
02043       if (ValVT == MVT::x86mmx) {
02044         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
02045           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
02046           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
02047                                   ValToCopy);
02048           // If we don't have SSE2 available, convert to v4f32 so the generated
02049           // register is legal.
02050           if (!Subtarget->hasSSE2())
02051             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
02052         }
02053       }
02054     }
02055 
02056     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
02057     Flag = Chain.getValue(1);
02058     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
02059   }
02060 
02061   // The x86-64 ABIs require that for returning structs by value we copy
02062   // the sret argument into %rax/%eax (depending on ABI) for the return.
02063   // Win32 requires us to put the sret argument to %eax as well.
02064   // We saved the argument into a virtual register in the entry block,
02065   // so now we copy the value out and into %rax/%eax.
02066   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
02067       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
02068     MachineFunction &MF = DAG.getMachineFunction();
02069     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02070     unsigned Reg = FuncInfo->getSRetReturnReg();
02071     assert(Reg &&
02072            "SRetReturnReg should have been set in LowerFormalArguments().");
02073     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
02074 
02075     unsigned RetValReg
02076         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
02077           X86::RAX : X86::EAX;
02078     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
02079     Flag = Chain.getValue(1);
02080 
02081     // RAX/EAX now acts like a return value.
02082     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
02083   }
02084 
02085   RetOps[0] = Chain;  // Update chain.
02086 
02087   // Add the flag if we have it.
02088   if (Flag.getNode())
02089     RetOps.push_back(Flag);
02090 
02091   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
02092 }
02093 
02094 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
02095   if (N->getNumValues() != 1)
02096     return false;
02097   if (!N->hasNUsesOfValue(1, 0))
02098     return false;
02099 
02100   SDValue TCChain = Chain;
02101   SDNode *Copy = *N->use_begin();
02102   if (Copy->getOpcode() == ISD::CopyToReg) {
02103     // If the copy has a glue operand, we conservatively assume it isn't safe to
02104     // perform a tail call.
02105     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
02106       return false;
02107     TCChain = Copy->getOperand(0);
02108   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
02109     return false;
02110 
02111   bool HasRet = false;
02112   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
02113        UI != UE; ++UI) {
02114     if (UI->getOpcode() != X86ISD::RET_FLAG)
02115       return false;
02116     // If we are returning more than one value, we can definitely
02117     // not make a tail call see PR19530
02118     if (UI->getNumOperands() > 4)
02119       return false;
02120     if (UI->getNumOperands() == 4 &&
02121         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
02122       return false;
02123     HasRet = true;
02124   }
02125 
02126   if (!HasRet)
02127     return false;
02128 
02129   Chain = TCChain;
02130   return true;
02131 }
02132 
02133 EVT
02134 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
02135                                             ISD::NodeType ExtendKind) const {
02136   MVT ReturnMVT;
02137   // TODO: Is this also valid on 32-bit?
02138   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
02139     ReturnMVT = MVT::i8;
02140   else
02141     ReturnMVT = MVT::i32;
02142 
02143   EVT MinVT = getRegisterType(Context, ReturnMVT);
02144   return VT.bitsLT(MinVT) ? MinVT : VT;
02145 }
02146 
02147 /// LowerCallResult - Lower the result values of a call into the
02148 /// appropriate copies out of appropriate physical registers.
02149 ///
02150 SDValue
02151 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
02152                                    CallingConv::ID CallConv, bool isVarArg,
02153                                    const SmallVectorImpl<ISD::InputArg> &Ins,
02154                                    SDLoc dl, SelectionDAG &DAG,
02155                                    SmallVectorImpl<SDValue> &InVals) const {
02156 
02157   // Assign locations to each value returned by this call.
02158   SmallVector<CCValAssign, 16> RVLocs;
02159   bool Is64Bit = Subtarget->is64Bit();
02160   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
02161                  *DAG.getContext());
02162   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02163 
02164   // Copy all of the result registers out of their specified physreg.
02165   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02166     CCValAssign &VA = RVLocs[i];
02167     EVT CopyVT = VA.getValVT();
02168 
02169     // If this is x86-64, and we disabled SSE, we can't return FP values
02170     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02171         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02172       report_fatal_error("SSE register return with SSE disabled");
02173     }
02174 
02175     // If we prefer to use the value in xmm registers, copy it out as f80 and
02176     // use a truncate to move it from fp stack reg to xmm reg.
02177     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
02178         isScalarFPTypeInSSEReg(VA.getValVT()))
02179       CopyVT = MVT::f80;
02180 
02181     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02182                                CopyVT, InFlag).getValue(1);
02183     SDValue Val = Chain.getValue(0);
02184 
02185     if (CopyVT != VA.getValVT())
02186       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02187                         // This truncation won't change the value.
02188                         DAG.getIntPtrConstant(1));
02189 
02190     InFlag = Chain.getValue(2);
02191     InVals.push_back(Val);
02192   }
02193 
02194   return Chain;
02195 }
02196 
02197 //===----------------------------------------------------------------------===//
02198 //                C & StdCall & Fast Calling Convention implementation
02199 //===----------------------------------------------------------------------===//
02200 //  StdCall calling convention seems to be standard for many Windows' API
02201 //  routines and around. It differs from C calling convention just a little:
02202 //  callee should clean up the stack, not caller. Symbols should be also
02203 //  decorated in some fancy way :) It doesn't support any vector arguments.
02204 //  For info on fast calling convention see Fast Calling Convention (tail call)
02205 //  implementation LowerX86_32FastCCCallTo.
02206 
02207 /// CallIsStructReturn - Determines whether a call uses struct return
02208 /// semantics.
02209 enum StructReturnType {
02210   NotStructReturn,
02211   RegStructReturn,
02212   StackStructReturn
02213 };
02214 static StructReturnType
02215 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02216   if (Outs.empty())
02217     return NotStructReturn;
02218 
02219   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02220   if (!Flags.isSRet())
02221     return NotStructReturn;
02222   if (Flags.isInReg())
02223     return RegStructReturn;
02224   return StackStructReturn;
02225 }
02226 
02227 /// ArgsAreStructReturn - Determines whether a function uses struct
02228 /// return semantics.
02229 static StructReturnType
02230 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02231   if (Ins.empty())
02232     return NotStructReturn;
02233 
02234   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02235   if (!Flags.isSRet())
02236     return NotStructReturn;
02237   if (Flags.isInReg())
02238     return RegStructReturn;
02239   return StackStructReturn;
02240 }
02241 
02242 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
02243 /// by "Src" to address "Dst" with size and alignment information specified by
02244 /// the specific parameter attribute. The copy will be passed as a byval
02245 /// function parameter.
02246 static SDValue
02247 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02248                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02249                           SDLoc dl) {
02250   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02251 
02252   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02253                        /*isVolatile*/false, /*AlwaysInline=*/true,
02254                        MachinePointerInfo(), MachinePointerInfo());
02255 }
02256 
02257 /// IsTailCallConvention - Return true if the calling convention is one that
02258 /// supports tail call optimization.
02259 static bool IsTailCallConvention(CallingConv::ID CC) {
02260   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02261           CC == CallingConv::HiPE);
02262 }
02263 
02264 /// \brief Return true if the calling convention is a C calling convention.
02265 static bool IsCCallConvention(CallingConv::ID CC) {
02266   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02267           CC == CallingConv::X86_64_SysV);
02268 }
02269 
02270 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02271   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02272     return false;
02273 
02274   CallSite CS(CI);
02275   CallingConv::ID CalleeCC = CS.getCallingConv();
02276   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02277     return false;
02278 
02279   return true;
02280 }
02281 
02282 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
02283 /// a tailcall target by changing its ABI.
02284 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02285                                    bool GuaranteedTailCallOpt) {
02286   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02287 }
02288 
02289 SDValue
02290 X86TargetLowering::LowerMemArgument(SDValue Chain,
02291                                     CallingConv::ID CallConv,
02292                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02293                                     SDLoc dl, SelectionDAG &DAG,
02294                                     const CCValAssign &VA,
02295                                     MachineFrameInfo *MFI,
02296                                     unsigned i) const {
02297   // Create the nodes corresponding to a load from this parameter slot.
02298   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02299   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
02300       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
02301   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02302   EVT ValVT;
02303 
02304   // If value is passed by pointer we have address passed instead of the value
02305   // itself.
02306   if (VA.getLocInfo() == CCValAssign::Indirect)
02307     ValVT = VA.getLocVT();
02308   else
02309     ValVT = VA.getValVT();
02310 
02311   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02312   // changed with more analysis.
02313   // In case of tail call optimization mark all arguments mutable. Since they
02314   // could be overwritten by lowering of arguments in case of a tail call.
02315   if (Flags.isByVal()) {
02316     unsigned Bytes = Flags.getByValSize();
02317     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02318     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02319     return DAG.getFrameIndex(FI, getPointerTy());
02320   } else {
02321     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02322                                     VA.getLocMemOffset(), isImmutable);
02323     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02324     return DAG.getLoad(ValVT, dl, Chain, FIN,
02325                        MachinePointerInfo::getFixedStack(FI),
02326                        false, false, false, 0);
02327   }
02328 }
02329 
02330 // FIXME: Get this from tablegen.
02331 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
02332                                                 const X86Subtarget *Subtarget) {
02333   assert(Subtarget->is64Bit());
02334 
02335   if (Subtarget->isCallingConvWin64(CallConv)) {
02336     static const MCPhysReg GPR64ArgRegsWin64[] = {
02337       X86::RCX, X86::RDX, X86::R8,  X86::R9
02338     };
02339     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
02340   }
02341 
02342   static const MCPhysReg GPR64ArgRegs64Bit[] = {
02343     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02344   };
02345   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
02346 }
02347 
02348 // FIXME: Get this from tablegen.
02349 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
02350                                                 CallingConv::ID CallConv,
02351                                                 const X86Subtarget *Subtarget) {
02352   assert(Subtarget->is64Bit());
02353   if (Subtarget->isCallingConvWin64(CallConv)) {
02354     // The XMM registers which might contain var arg parameters are shadowed
02355     // in their paired GPR.  So we only need to save the GPR to their home
02356     // slots.
02357     // TODO: __vectorcall will change this.
02358     return None;
02359   }
02360 
02361   const Function *Fn = MF.getFunction();
02362   bool NoImplicitFloatOps = Fn->getAttributes().
02363       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
02364   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
02365          "SSE register cannot be used when SSE is disabled!");
02366   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02367       !Subtarget->hasSSE1())
02368     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
02369     // registers.
02370     return None;
02371 
02372   static const MCPhysReg XMMArgRegs64Bit[] = {
02373     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02374     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02375   };
02376   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
02377 }
02378 
02379 SDValue
02380 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02381                                         CallingConv::ID CallConv,
02382                                         bool isVarArg,
02383                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02384                                         SDLoc dl,
02385                                         SelectionDAG &DAG,
02386                                         SmallVectorImpl<SDValue> &InVals)
02387                                           const {
02388   MachineFunction &MF = DAG.getMachineFunction();
02389   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02390 
02391   const Function* Fn = MF.getFunction();
02392   if (Fn->hasExternalLinkage() &&
02393       Subtarget->isTargetCygMing() &&
02394       Fn->getName() == "main")
02395     FuncInfo->setForceFramePointer(true);
02396 
02397   MachineFrameInfo *MFI = MF.getFrameInfo();
02398   bool Is64Bit = Subtarget->is64Bit();
02399   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02400 
02401   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02402          "Var args not supported with calling convention fastcc, ghc or hipe");
02403 
02404   // Assign locations to all of the incoming arguments.
02405   SmallVector<CCValAssign, 16> ArgLocs;
02406   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02407 
02408   // Allocate shadow area for Win64
02409   if (IsWin64)
02410     CCInfo.AllocateStack(32, 8);
02411 
02412   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02413 
02414   unsigned LastVal = ~0U;
02415   SDValue ArgValue;
02416   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02417     CCValAssign &VA = ArgLocs[i];
02418     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02419     // places.
02420     assert(VA.getValNo() != LastVal &&
02421            "Don't support value assigned to multiple locs yet");
02422     (void)LastVal;
02423     LastVal = VA.getValNo();
02424 
02425     if (VA.isRegLoc()) {
02426       EVT RegVT = VA.getLocVT();
02427       const TargetRegisterClass *RC;
02428       if (RegVT == MVT::i32)
02429         RC = &X86::GR32RegClass;
02430       else if (Is64Bit && RegVT == MVT::i64)
02431         RC = &X86::GR64RegClass;
02432       else if (RegVT == MVT::f32)
02433         RC = &X86::FR32RegClass;
02434       else if (RegVT == MVT::f64)
02435         RC = &X86::FR64RegClass;
02436       else if (RegVT.is512BitVector())
02437         RC = &X86::VR512RegClass;
02438       else if (RegVT.is256BitVector())
02439         RC = &X86::VR256RegClass;
02440       else if (RegVT.is128BitVector())
02441         RC = &X86::VR128RegClass;
02442       else if (RegVT == MVT::x86mmx)
02443         RC = &X86::VR64RegClass;
02444       else if (RegVT == MVT::i1)
02445         RC = &X86::VK1RegClass;
02446       else if (RegVT == MVT::v8i1)
02447         RC = &X86::VK8RegClass;
02448       else if (RegVT == MVT::v16i1)
02449         RC = &X86::VK16RegClass;
02450       else if (RegVT == MVT::v32i1)
02451         RC = &X86::VK32RegClass;
02452       else if (RegVT == MVT::v64i1)
02453         RC = &X86::VK64RegClass;
02454       else
02455         llvm_unreachable("Unknown argument type!");
02456 
02457       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02458       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02459 
02460       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02461       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02462       // right size.
02463       if (VA.getLocInfo() == CCValAssign::SExt)
02464         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02465                                DAG.getValueType(VA.getValVT()));
02466       else if (VA.getLocInfo() == CCValAssign::ZExt)
02467         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02468                                DAG.getValueType(VA.getValVT()));
02469       else if (VA.getLocInfo() == CCValAssign::BCvt)
02470         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02471 
02472       if (VA.isExtInLoc()) {
02473         // Handle MMX values passed in XMM regs.
02474         if (RegVT.isVector())
02475           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02476         else
02477           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02478       }
02479     } else {
02480       assert(VA.isMemLoc());
02481       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02482     }
02483 
02484     // If value is passed via pointer - do a load.
02485     if (VA.getLocInfo() == CCValAssign::Indirect)
02486       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02487                              MachinePointerInfo(), false, false, false, 0);
02488 
02489     InVals.push_back(ArgValue);
02490   }
02491 
02492   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
02493     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02494       // The x86-64 ABIs require that for returning structs by value we copy
02495       // the sret argument into %rax/%eax (depending on ABI) for the return.
02496       // Win32 requires us to put the sret argument to %eax as well.
02497       // Save the argument into a virtual register so that we can access it
02498       // from the return points.
02499       if (Ins[i].Flags.isSRet()) {
02500         unsigned Reg = FuncInfo->getSRetReturnReg();
02501         if (!Reg) {
02502           MVT PtrTy = getPointerTy();
02503           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02504           FuncInfo->setSRetReturnReg(Reg);
02505         }
02506         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
02507         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02508         break;
02509       }
02510     }
02511   }
02512 
02513   unsigned StackSize = CCInfo.getNextStackOffset();
02514   // Align stack specially for tail calls.
02515   if (FuncIsMadeTailCallSafe(CallConv,
02516                              MF.getTarget().Options.GuaranteedTailCallOpt))
02517     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02518 
02519   // If the function takes variable number of arguments, make a frame index for
02520   // the start of the first vararg value... for expansion of llvm.va_start. We
02521   // can skip this if there are no va_start calls.
02522   if (MFI->hasVAStart() &&
02523       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02524                    CallConv != CallingConv::X86_ThisCall))) {
02525     FuncInfo->setVarArgsFrameIndex(
02526         MFI->CreateFixedObject(1, StackSize, true));
02527   }
02528 
02529   // 64-bit calling conventions support varargs and register parameters, so we
02530   // have to do extra work to spill them in the prologue or forward them to
02531   // musttail calls.
02532   if (Is64Bit && isVarArg &&
02533       (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
02534     // Find the first unallocated argument registers.
02535     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
02536     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
02537     unsigned NumIntRegs =
02538         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
02539     unsigned NumXMMRegs =
02540         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
02541     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02542            "SSE register cannot be used when SSE is disabled!");
02543 
02544     // Gather all the live in physical registers.
02545     SmallVector<SDValue, 6> LiveGPRs;
02546     SmallVector<SDValue, 8> LiveXMMRegs;
02547     SDValue ALVal;
02548     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
02549       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
02550       LiveGPRs.push_back(
02551           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
02552     }
02553     if (!ArgXMMs.empty()) {
02554       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02555       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
02556       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
02557         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
02558         LiveXMMRegs.push_back(
02559             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
02560       }
02561     }
02562 
02563     // Store them to the va_list returned by va_start.
02564     if (MFI->hasVAStart()) {
02565       if (IsWin64) {
02566         const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
02567         // Get to the caller-allocated home save location.  Add 8 to account
02568         // for the return address.
02569         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02570         FuncInfo->setRegSaveFrameIndex(
02571           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02572         // Fixup to set vararg frame on shadow area (4 x i64).
02573         if (NumIntRegs < 4)
02574           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02575       } else {
02576         // For X86-64, if there are vararg parameters that are passed via
02577         // registers, then we must store them to their spots on the stack so
02578         // they may be loaded by deferencing the result of va_next.
02579         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02580         FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
02581         FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
02582             ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
02583       }
02584 
02585       // Store the integer parameter registers.
02586       SmallVector<SDValue, 8> MemOps;
02587       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02588                                         getPointerTy());
02589       unsigned Offset = FuncInfo->getVarArgsGPOffset();
02590       for (SDValue Val : LiveGPRs) {
02591         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02592                                   DAG.getIntPtrConstant(Offset));
02593         SDValue Store =
02594           DAG.getStore(Val.getValue(1), dl, Val, FIN,
02595                        MachinePointerInfo::getFixedStack(
02596                          FuncInfo->getRegSaveFrameIndex(), Offset),
02597                        false, false, 0);
02598         MemOps.push_back(Store);
02599         Offset += 8;
02600       }
02601 
02602       if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
02603         // Now store the XMM (fp + vector) parameter registers.
02604         SmallVector<SDValue, 12> SaveXMMOps;
02605         SaveXMMOps.push_back(Chain);
02606         SaveXMMOps.push_back(ALVal);
02607         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02608                                FuncInfo->getRegSaveFrameIndex()));
02609         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02610                                FuncInfo->getVarArgsFPOffset()));
02611         SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
02612                           LiveXMMRegs.end());
02613         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02614                                      MVT::Other, SaveXMMOps));
02615       }
02616 
02617       if (!MemOps.empty())
02618         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
02619     } else {
02620       // Add all GPRs, al, and XMMs to the list of forwards.  We will add then
02621       // to the liveout set on a musttail call.
02622       assert(MFI->hasMustTailInVarArgFunc());
02623       auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
02624       typedef X86MachineFunctionInfo::Forward Forward;
02625 
02626       for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
02627         unsigned VReg =
02628             MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
02629         Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
02630         Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
02631       }
02632 
02633       if (!ArgXMMs.empty()) {
02634         unsigned ALVReg =
02635             MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
02636         Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
02637         Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
02638 
02639         for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
02640           unsigned VReg =
02641               MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
02642           Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
02643           Forwards.push_back(
02644               Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
02645         }
02646       }
02647     }
02648   }
02649 
02650   // Some CCs need callee pop.
02651   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02652                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02653     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02654   } else {
02655     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02656     // If this is an sret function, the return should pop the hidden pointer.
02657     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02658         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02659         argsAreStructReturn(Ins) == StackStructReturn)
02660       FuncInfo->setBytesToPopOnReturn(4);
02661   }
02662 
02663   if (!Is64Bit) {
02664     // RegSaveFrameIndex is X86-64 only.
02665     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02666     if (CallConv == CallingConv::X86_FastCall ||
02667         CallConv == CallingConv::X86_ThisCall)
02668       // fastcc functions can't have varargs.
02669       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02670   }
02671 
02672   FuncInfo->setArgumentStackSize(StackSize);
02673 
02674   return Chain;
02675 }
02676 
02677 SDValue
02678 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02679                                     SDValue StackPtr, SDValue Arg,
02680                                     SDLoc dl, SelectionDAG &DAG,
02681                                     const CCValAssign &VA,
02682                                     ISD::ArgFlagsTy Flags) const {
02683   unsigned LocMemOffset = VA.getLocMemOffset();
02684   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02685   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02686   if (Flags.isByVal())
02687     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02688 
02689   return DAG.getStore(Chain, dl, Arg, PtrOff,
02690                       MachinePointerInfo::getStack(LocMemOffset),
02691                       false, false, 0);
02692 }
02693 
02694 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
02695 /// optimization is performed and it is required.
02696 SDValue
02697 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02698                                            SDValue &OutRetAddr, SDValue Chain,
02699                                            bool IsTailCall, bool Is64Bit,
02700                                            int FPDiff, SDLoc dl) const {
02701   // Adjust the Return address stack slot.
02702   EVT VT = getPointerTy();
02703   OutRetAddr = getReturnAddressFrameIndex(DAG);
02704 
02705   // Load the "old" Return address.
02706   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02707                            false, false, false, 0);
02708   return SDValue(OutRetAddr.getNode(), 1);
02709 }
02710 
02711 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
02712 /// optimization is performed and it is required (FPDiff!=0).
02713 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
02714                                         SDValue Chain, SDValue RetAddrFrIdx,
02715                                         EVT PtrVT, unsigned SlotSize,
02716                                         int FPDiff, SDLoc dl) {
02717   // Store the return address to the appropriate stack slot.
02718   if (!FPDiff) return Chain;
02719   // Calculate the new stack slot for the return address.
02720   int NewReturnAddrFI =
02721     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02722                                          false);
02723   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02724   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02725                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02726                        false, false, 0);
02727   return Chain;
02728 }
02729 
02730 SDValue
02731 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02732                              SmallVectorImpl<SDValue> &InVals) const {
02733   SelectionDAG &DAG                     = CLI.DAG;
02734   SDLoc &dl                             = CLI.DL;
02735   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02736   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02737   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02738   SDValue Chain                         = CLI.Chain;
02739   SDValue Callee                        = CLI.Callee;
02740   CallingConv::ID CallConv              = CLI.CallConv;
02741   bool &isTailCall                      = CLI.IsTailCall;
02742   bool isVarArg                         = CLI.IsVarArg;
02743 
02744   MachineFunction &MF = DAG.getMachineFunction();
02745   bool Is64Bit        = Subtarget->is64Bit();
02746   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02747   StructReturnType SR = callIsStructReturn(Outs);
02748   bool IsSibcall      = false;
02749   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02750 
02751   if (MF.getTarget().Options.DisableTailCalls)
02752     isTailCall = false;
02753 
02754   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
02755   if (IsMustTail) {
02756     // Force this to be a tail call.  The verifier rules are enough to ensure
02757     // that we can lower this successfully without moving the return address
02758     // around.
02759     isTailCall = true;
02760   } else if (isTailCall) {
02761     // Check if it's really possible to do a tail call.
02762     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02763                     isVarArg, SR != NotStructReturn,
02764                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02765                     Outs, OutVals, Ins, DAG);
02766 
02767     // Sibcalls are automatically detected tailcalls which do not require
02768     // ABI changes.
02769     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02770       IsSibcall = true;
02771 
02772     if (isTailCall)
02773       ++NumTailCalls;
02774   }
02775 
02776   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02777          "Var args not supported with calling convention fastcc, ghc or hipe");
02778 
02779   // Analyze operands of the call, assigning locations to each operand.
02780   SmallVector<CCValAssign, 16> ArgLocs;
02781   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02782 
02783   // Allocate shadow area for Win64
02784   if (IsWin64)
02785     CCInfo.AllocateStack(32, 8);
02786 
02787   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02788 
02789   // Get a count of how many bytes are to be pushed on the stack.
02790   unsigned NumBytes = CCInfo.getNextStackOffset();
02791   if (IsSibcall)
02792     // This is a sibcall. The memory operands are available in caller's
02793     // own caller's stack.
02794     NumBytes = 0;
02795   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
02796            IsTailCallConvention(CallConv))
02797     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02798 
02799   int FPDiff = 0;
02800   if (isTailCall && !IsSibcall && !IsMustTail) {
02801     // Lower arguments at fp - stackoffset + fpdiff.
02802     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02803 
02804     FPDiff = NumBytesCallerPushed - NumBytes;
02805 
02806     // Set the delta of movement of the returnaddr stackslot.
02807     // But only set if delta is greater than previous delta.
02808     if (FPDiff < X86Info->getTCReturnAddrDelta())
02809       X86Info->setTCReturnAddrDelta(FPDiff);
02810   }
02811 
02812   unsigned NumBytesToPush = NumBytes;
02813   unsigned NumBytesToPop = NumBytes;
02814 
02815   // If we have an inalloca argument, all stack space has already been allocated
02816   // for us and be right at the top of the stack.  We don't support multiple
02817   // arguments passed in memory when using inalloca.
02818   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02819     NumBytesToPush = 0;
02820     if (!ArgLocs.back().isMemLoc())
02821       report_fatal_error("cannot use inalloca attribute on a register "
02822                          "parameter");
02823     if (ArgLocs.back().getLocMemOffset() != 0)
02824       report_fatal_error("any parameter with the inalloca attribute must be "
02825                          "the only memory argument");
02826   }
02827 
02828   if (!IsSibcall)
02829     Chain = DAG.getCALLSEQ_START(
02830         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02831 
02832   SDValue RetAddrFrIdx;
02833   // Load return address for tail calls.
02834   if (isTailCall && FPDiff)
02835     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02836                                     Is64Bit, FPDiff, dl);
02837 
02838   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02839   SmallVector<SDValue, 8> MemOpChains;
02840   SDValue StackPtr;
02841 
02842   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02843   // of tail call optimization arguments are handle later.
02844   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
02845       DAG.getSubtarget().getRegisterInfo());
02846   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02847     // Skip inalloca arguments, they have already been written.
02848     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02849     if (Flags.isInAlloca())
02850       continue;
02851 
02852     CCValAssign &VA = ArgLocs[i];
02853     EVT RegVT = VA.getLocVT();
02854     SDValue Arg = OutVals[i];
02855     bool isByVal = Flags.isByVal();
02856 
02857     // Promote the value if needed.
02858     switch (VA.getLocInfo()) {
02859     default: llvm_unreachable("Unknown loc info!");
02860     case CCValAssign::Full: break;
02861     case CCValAssign::SExt:
02862       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02863       break;
02864     case CCValAssign::ZExt:
02865       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02866       break;
02867     case CCValAssign::AExt:
02868       if (RegVT.is128BitVector()) {
02869         // Special case: passing MMX values in XMM registers.
02870         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02871         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02872         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02873       } else
02874         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02875       break;
02876     case CCValAssign::BCvt:
02877       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02878       break;
02879     case CCValAssign::Indirect: {
02880       // Store the argument.
02881       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02882       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02883       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02884                            MachinePointerInfo::getFixedStack(FI),
02885                            false, false, 0);
02886       Arg = SpillSlot;
02887       break;
02888     }
02889     }
02890 
02891     if (VA.isRegLoc()) {
02892       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02893       if (isVarArg && IsWin64) {
02894         // Win64 ABI requires argument XMM reg to be copied to the corresponding
02895         // shadow reg if callee is a varargs function.
02896         unsigned ShadowReg = 0;
02897         switch (VA.getLocReg()) {
02898         case X86::XMM0: ShadowReg = X86::RCX; break;
02899         case X86::XMM1: ShadowReg = X86::RDX; break;
02900         case X86::XMM2: ShadowReg = X86::R8; break;
02901         case X86::XMM3: ShadowReg = X86::R9; break;
02902         }
02903         if (ShadowReg)
02904           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02905       }
02906     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02907       assert(VA.isMemLoc());
02908       if (!StackPtr.getNode())
02909         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02910                                       getPointerTy());
02911       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02912                                              dl, DAG, VA, Flags));
02913     }
02914   }
02915 
02916   if (!MemOpChains.empty())
02917     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
02918 
02919   if (Subtarget->isPICStyleGOT()) {
02920     // ELF / PIC requires GOT in the EBX register before function calls via PLT
02921     // GOT pointer.
02922     if (!isTailCall) {
02923       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02924                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02925     } else {
02926       // If we are tail calling and generating PIC/GOT style code load the
02927       // address of the callee into ECX. The value in ecx is used as target of
02928       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02929       // for tail calls on PIC/GOT architectures. Normally we would just put the
02930       // address of GOT into ebx and then call target@PLT. But for tail calls
02931       // ebx would be restored (since ebx is callee saved) before jumping to the
02932       // target@PLT.
02933 
02934       // Note: The actual moving to ECX is done further down.
02935       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02936       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02937           !G->getGlobal()->hasProtectedVisibility())
02938         Callee = LowerGlobalAddress(Callee, DAG);
02939       else if (isa<ExternalSymbolSDNode>(Callee))
02940         Callee = LowerExternalSymbol(Callee, DAG);
02941     }
02942   }
02943 
02944   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
02945     // From AMD64 ABI document:
02946     // For calls that may call functions that use varargs or stdargs
02947     // (prototype-less calls or calls to functions containing ellipsis (...) in
02948     // the declaration) %al is used as hidden argument to specify the number
02949     // of SSE registers used. The contents of %al do not need to match exactly
02950     // the number of registers, but must be an ubound on the number of SSE
02951     // registers used and is in the range 0 - 8 inclusive.
02952 
02953     // Count the number of XMM registers allocated.
02954     static const MCPhysReg XMMArgRegs[] = {
02955       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02956       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02957     };
02958     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
02959     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02960            && "SSE registers cannot be used when SSE is disabled");
02961 
02962     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02963                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
02964   }
02965 
02966   if (Is64Bit && isVarArg && IsMustTail) {
02967     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
02968     for (const auto &F : Forwards) {
02969       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02970       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
02971     }
02972   }
02973 
02974   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
02975   // don't need this because the eligibility check rejects calls that require
02976   // shuffling arguments passed in memory.
02977   if (!IsSibcall && isTailCall) {
02978     // Force all the incoming stack arguments to be loaded from the stack
02979     // before any new outgoing arguments are stored to the stack, because the
02980     // outgoing stack slots may alias the incoming argument stack slots, and
02981     // the alias isn't otherwise explicit. This is slightly more conservative
02982     // than necessary, because it means that each store effectively depends
02983     // on every argument instead of just those arguments it would clobber.
02984     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02985 
02986     SmallVector<SDValue, 8> MemOpChains2;
02987     SDValue FIN;
02988     int FI = 0;
02989     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02990       CCValAssign &VA = ArgLocs[i];
02991       if (VA.isRegLoc())
02992         continue;
02993       assert(VA.isMemLoc());
02994       SDValue Arg = OutVals[i];
02995       ISD::ArgFlagsTy Flags = Outs[i].Flags;
02996       // Skip inalloca arguments.  They don't require any work.
02997       if (Flags.isInAlloca())
02998         continue;
02999       // Create frame index.
03000       int32_t Offset = VA.getLocMemOffset()+FPDiff;
03001       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
03002       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
03003       FIN = DAG.getFrameIndex(FI, getPointerTy());
03004 
03005       if (Flags.isByVal()) {
03006         // Copy relative to framepointer.
03007         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
03008         if (!StackPtr.getNode())
03009           StackPtr = DAG.getCopyFromReg(Chain, dl,
03010                                         RegInfo->getStackRegister(),
03011                                         getPointerTy());
03012         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
03013 
03014         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
03015                                                          ArgChain,
03016                                                          Flags, DAG, dl));
03017       } else {
03018         // Store relative to framepointer.
03019         MemOpChains2.push_back(
03020           DAG.getStore(ArgChain, dl, Arg, FIN,
03021                        MachinePointerInfo::getFixedStack(FI),
03022                        false, false, 0));
03023       }
03024     }
03025 
03026     if (!MemOpChains2.empty())
03027       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
03028 
03029     // Store the return address to the appropriate stack slot.
03030     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
03031                                      getPointerTy(), RegInfo->getSlotSize(),
03032                                      FPDiff, dl);
03033   }
03034 
03035   // Build a sequence of copy-to-reg nodes chained together with token chain
03036   // and flag operands which copy the outgoing args into registers.
03037   SDValue InFlag;
03038   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
03039     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
03040                              RegsToPass[i].second, InFlag);
03041     InFlag = Chain.getValue(1);
03042   }
03043 
03044   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
03045     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
03046     // In the 64-bit large code model, we have to make all calls
03047     // through a register, since the call instruction's 32-bit
03048     // pc-relative offset may not be large enough to hold the whole
03049     // address.
03050   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
03051     // If the callee is a GlobalAddress node (quite common, every direct call
03052     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
03053     // it.
03054 
03055     // We should use extra load for direct calls to dllimported functions in
03056     // non-JIT mode.
03057     const GlobalValue *GV = G->getGlobal();
03058     if (!GV->hasDLLImportStorageClass()) {
03059       unsigned char OpFlags = 0;
03060       bool ExtraLoad = false;
03061       unsigned WrapperKind = ISD::DELETED_NODE;
03062 
03063       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
03064       // external symbols most go through the PLT in PIC mode.  If the symbol
03065       // has hidden or protected visibility, or if it is static or local, then
03066       // we don't need to use the PLT - we can directly call it.
03067       if (Subtarget->isTargetELF() &&
03068           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
03069           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
03070         OpFlags = X86II::MO_PLT;
03071       } else if (Subtarget->isPICStyleStubAny() &&
03072                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
03073                  (!Subtarget->getTargetTriple().isMacOSX() ||
03074                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03075         // PC-relative references to external symbols should go through $stub,
03076         // unless we're building with the leopard linker or later, which
03077         // automatically synthesizes these stubs.
03078         OpFlags = X86II::MO_DARWIN_STUB;
03079       } else if (Subtarget->isPICStyleRIPRel() &&
03080                  isa<Function>(GV) &&
03081                  cast<Function>(GV)->getAttributes().
03082                    hasAttribute(AttributeSet::FunctionIndex,
03083                                 Attribute::NonLazyBind)) {
03084         // If the function is marked as non-lazy, generate an indirect call
03085         // which loads from the GOT directly. This avoids runtime overhead
03086         // at the cost of eager binding (and one extra byte of encoding).
03087         OpFlags = X86II::MO_GOTPCREL;
03088         WrapperKind = X86ISD::WrapperRIP;
03089         ExtraLoad = true;
03090       }
03091 
03092       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
03093                                           G->getOffset(), OpFlags);
03094 
03095       // Add a wrapper if needed.
03096       if (WrapperKind != ISD::DELETED_NODE)
03097         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
03098       // Add extra indirection if needed.
03099       if (ExtraLoad)
03100         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
03101                              MachinePointerInfo::getGOT(),
03102                              false, false, false, 0);
03103     }
03104   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
03105     unsigned char OpFlags = 0;
03106 
03107     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
03108     // external symbols should go through the PLT.
03109     if (Subtarget->isTargetELF() &&
03110         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
03111       OpFlags = X86II::MO_PLT;
03112     } else if (Subtarget->isPICStyleStubAny() &&
03113                (!Subtarget->getTargetTriple().isMacOSX() ||
03114                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03115       // PC-relative references to external symbols should go through $stub,
03116       // unless we're building with the leopard linker or later, which
03117       // automatically synthesizes these stubs.
03118       OpFlags = X86II::MO_DARWIN_STUB;
03119     }
03120 
03121     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
03122                                          OpFlags);
03123   } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
03124     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
03125     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
03126   }
03127 
03128   // Returns a chain & a flag for retval copy to use.
03129   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
03130   SmallVector<SDValue, 8> Ops;
03131 
03132   if (!IsSibcall && isTailCall) {
03133     Chain = DAG.getCALLSEQ_END(Chain,
03134                                DAG.getIntPtrConstant(NumBytesToPop, true),
03135                                DAG.getIntPtrConstant(0, true), InFlag, dl);
03136     InFlag = Chain.getValue(1);
03137   }
03138 
03139   Ops.push_back(Chain);
03140   Ops.push_back(Callee);
03141 
03142   if (isTailCall)
03143     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
03144 
03145   // Add argument registers to the end of the list so that they are known live
03146   // into the call.
03147   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
03148     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
03149                                   RegsToPass[i].second.getValueType()));
03150 
03151   // Add a register mask operand representing the call-preserved registers.
03152   const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
03153   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
03154   assert(Mask && "Missing call preserved mask for calling convention");
03155   Ops.push_back(DAG.getRegisterMask(Mask));
03156 
03157   if (InFlag.getNode())
03158     Ops.push_back(InFlag);
03159 
03160   if (isTailCall) {
03161     // We used to do:
03162     //// If this is the first return lowered for this function, add the regs
03163     //// to the liveout set for the function.
03164     // This isn't right, although it's probably harmless on x86; liveouts
03165     // should be computed from returns not tail calls.  Consider a void
03166     // function making a tail call to a function returning int.
03167     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03168   }
03169 
03170   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03171   InFlag = Chain.getValue(1);
03172 
03173   // Create the CALLSEQ_END node.
03174   unsigned NumBytesForCalleeToPop;
03175   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03176                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03177     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03178   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03179            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03180            SR == StackStructReturn)
03181     // If this is a call to a struct-return function, the callee
03182     // pops the hidden struct pointer, so we have to push it back.
03183     // This is common for Darwin/X86, Linux & Mingw32 targets.
03184     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03185     NumBytesForCalleeToPop = 4;
03186   else
03187     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03188 
03189   // Returns a flag for retval copy to use.
03190   if (!IsSibcall) {
03191     Chain = DAG.getCALLSEQ_END(Chain,
03192                                DAG.getIntPtrConstant(NumBytesToPop, true),
03193                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03194                                                      true),
03195                                InFlag, dl);
03196     InFlag = Chain.getValue(1);
03197   }
03198 
03199   // Handle result values, copying them out of physregs into vregs that we
03200   // return.
03201   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03202                          Ins, dl, DAG, InVals);
03203 }
03204 
03205 //===----------------------------------------------------------------------===//
03206 //                Fast Calling Convention (tail call) implementation
03207 //===----------------------------------------------------------------------===//
03208 
03209 //  Like std call, callee cleans arguments, convention except that ECX is
03210 //  reserved for storing the tail called function address. Only 2 registers are
03211 //  free for argument passing (inreg). Tail call optimization is performed
03212 //  provided:
03213 //                * tailcallopt is enabled
03214 //                * caller/callee are fastcc
03215 //  On X86_64 architecture with GOT-style position independent code only local
03216 //  (within module) calls are supported at the moment.
03217 //  To keep the stack aligned according to platform abi the function
03218 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
03219 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
03220 //  If a tail called function callee has more arguments than the caller the
03221 //  caller needs to make sure that there is room to move the RETADDR to. This is
03222 //  achieved by reserving an area the size of the argument delta right after the
03223 //  original RETADDR, but before the saved framepointer or the spilled registers
03224 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
03225 //  stack layout:
03226 //    arg1
03227 //    arg2
03228 //    RETADDR
03229 //    [ new RETADDR
03230 //      move area ]
03231 //    (possible EBP)
03232 //    ESI
03233 //    EDI
03234 //    local1 ..
03235 
03236 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
03237 /// for a 16 byte align requirement.
03238 unsigned
03239 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03240                                                SelectionDAG& DAG) const {
03241   MachineFunction &MF = DAG.getMachineFunction();
03242   const TargetMachine &TM = MF.getTarget();
03243   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03244       TM.getSubtargetImpl()->getRegisterInfo());
03245   const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
03246   unsigned StackAlignment = TFI.getStackAlignment();
03247   uint64_t AlignMask = StackAlignment - 1;
03248   int64_t Offset = StackSize;
03249   unsigned SlotSize = RegInfo->getSlotSize();
03250   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03251     // Number smaller than 12 so just add the difference.
03252     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03253   } else {
03254     // Mask out lower bits, add stackalignment once plus the 12 bytes.
03255     Offset = ((~AlignMask) & Offset) + StackAlignment +
03256       (StackAlignment-SlotSize);
03257   }
03258   return Offset;
03259 }
03260 
03261 /// MatchingStackOffset - Return true if the given stack call argument is
03262 /// already available in the same position (relatively) of the caller's
03263 /// incoming argument stack.
03264 static
03265 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03266                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03267                          const X86InstrInfo *TII) {
03268   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03269   int FI = INT_MAX;
03270   if (Arg.getOpcode() == ISD::CopyFromReg) {
03271     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03272     if (!TargetRegisterInfo::isVirtualRegister(VR))
03273       return false;
03274     MachineInstr *Def = MRI->getVRegDef(VR);
03275     if (!Def)
03276       return false;
03277     if (!Flags.isByVal()) {
03278       if (!TII->isLoadFromStackSlot(Def, FI))
03279         return false;
03280     } else {
03281       unsigned Opcode = Def->getOpcode();
03282       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
03283           Def->getOperand(1).isFI()) {
03284         FI = Def->getOperand(1).getIndex();
03285         Bytes = Flags.getByValSize();
03286       } else
03287         return false;
03288     }
03289   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03290     if (Flags.isByVal())
03291       // ByVal argument is passed in as a pointer but it's now being
03292       // dereferenced. e.g.
03293       // define @foo(%struct.X* %A) {
03294       //   tail call @bar(%struct.X* byval %A)
03295       // }
03296       return false;
03297     SDValue Ptr = Ld->getBasePtr();
03298     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03299     if (!FINode)
03300       return false;
03301     FI = FINode->getIndex();
03302   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03303     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03304     FI = FINode->getIndex();
03305     Bytes = Flags.getByValSize();
03306   } else
03307     return false;
03308 
03309   assert(FI != INT_MAX);
03310   if (!MFI->isFixedObjectIndex(FI))
03311     return false;
03312   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03313 }
03314 
03315 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03316 /// for tail call optimization. Targets which want to do tail call
03317 /// optimization should implement this function.
03318 bool
03319 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03320                                                      CallingConv::ID CalleeCC,
03321                                                      bool isVarArg,
03322                                                      bool isCalleeStructRet,
03323                                                      bool isCallerStructRet,
03324                                                      Type *RetTy,
03325                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03326                                     const SmallVectorImpl<SDValue> &OutVals,
03327                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03328                                                      SelectionDAG &DAG) const {
03329   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03330     return false;
03331 
03332   // If -tailcallopt is specified, make fastcc functions tail-callable.
03333   const MachineFunction &MF = DAG.getMachineFunction();
03334   const Function *CallerF = MF.getFunction();
03335 
03336   // If the function return type is x86_fp80 and the callee return type is not,
03337   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03338   // perform a tailcall optimization here.
03339   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03340     return false;
03341 
03342   CallingConv::ID CallerCC = CallerF->getCallingConv();
03343   bool CCMatch = CallerCC == CalleeCC;
03344   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03345   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03346 
03347   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
03348     if (IsTailCallConvention(CalleeCC) && CCMatch)
03349       return true;
03350     return false;
03351   }
03352 
03353   // Look for obvious safe cases to perform tail call optimization that do not
03354   // require ABI changes. This is what gcc calls sibcall.
03355 
03356   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03357   // emit a special epilogue.
03358   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03359       DAG.getSubtarget().getRegisterInfo());
03360   if (RegInfo->needsStackRealignment(MF))
03361     return false;
03362 
03363   // Also avoid sibcall optimization if either caller or callee uses struct
03364   // return semantics.
03365   if (isCalleeStructRet || isCallerStructRet)
03366     return false;
03367 
03368   // An stdcall/thiscall caller is expected to clean up its arguments; the
03369   // callee isn't going to do that.
03370   // FIXME: this is more restrictive than needed. We could produce a tailcall
03371   // when the stack adjustment matches. For example, with a thiscall that takes
03372   // only one argument.
03373   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03374                    CallerCC == CallingConv::X86_ThisCall))
03375     return false;
03376 
03377   // Do not sibcall optimize vararg calls unless all arguments are passed via
03378   // registers.
03379   if (isVarArg && !Outs.empty()) {
03380 
03381     // Optimizing for varargs on Win64 is unlikely to be safe without
03382     // additional testing.
03383     if (IsCalleeWin64 || IsCallerWin64)
03384       return false;
03385 
03386     SmallVector<CCValAssign, 16> ArgLocs;
03387     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03388                    *DAG.getContext());
03389 
03390     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03391     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03392       if (!ArgLocs[i].isRegLoc())
03393         return false;
03394   }
03395 
03396   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03397   // stack.  Therefore, if it's not used by the call it is not safe to optimize
03398   // this into a sibcall.
03399   bool Unused = false;
03400   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03401     if (!Ins[i].Used) {
03402       Unused = true;
03403       break;
03404     }
03405   }
03406   if (Unused) {
03407     SmallVector<CCValAssign, 16> RVLocs;
03408     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
03409                    *DAG.getContext());
03410     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03411     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03412       CCValAssign &VA = RVLocs[i];
03413       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
03414         return false;
03415     }
03416   }
03417 
03418   // If the calling conventions do not match, then we'd better make sure the
03419   // results are returned in the same way as what the caller expects.
03420   if (!CCMatch) {
03421     SmallVector<CCValAssign, 16> RVLocs1;
03422     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
03423                     *DAG.getContext());
03424     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03425 
03426     SmallVector<CCValAssign, 16> RVLocs2;
03427     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
03428                     *DAG.getContext());
03429     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03430 
03431     if (RVLocs1.size() != RVLocs2.size())
03432       return false;
03433     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03434       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03435         return false;
03436       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03437         return false;
03438       if (RVLocs1[i].isRegLoc()) {
03439         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03440           return false;
03441       } else {
03442         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03443           return false;
03444       }
03445     }
03446   }
03447 
03448   // If the callee takes no arguments then go on to check the results of the
03449   // call.
03450   if (!Outs.empty()) {
03451     // Check if stack adjustment is needed. For now, do not do this if any
03452     // argument is passed on the stack.
03453     SmallVector<CCValAssign, 16> ArgLocs;
03454     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03455                    *DAG.getContext());
03456 
03457     // Allocate shadow area for Win64
03458     if (IsCalleeWin64)
03459       CCInfo.AllocateStack(32, 8);
03460 
03461     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03462     if (CCInfo.getNextStackOffset()) {
03463       MachineFunction &MF = DAG.getMachineFunction();
03464       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03465         return false;
03466 
03467       // Check if the arguments are already laid out in the right way as
03468       // the caller's fixed stack objects.
03469       MachineFrameInfo *MFI = MF.getFrameInfo();
03470       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03471       const X86InstrInfo *TII =
03472           static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
03473       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03474         CCValAssign &VA = ArgLocs[i];
03475         SDValue Arg = OutVals[i];
03476         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03477         if (VA.getLocInfo() == CCValAssign::Indirect)
03478           return false;
03479         if (!VA.isRegLoc()) {
03480           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03481                                    MFI, MRI, TII))
03482             return false;
03483         }
03484       }
03485     }
03486 
03487     // If the tailcall address may be in a register, then make sure it's
03488     // possible to register allocate for it. In 32-bit, the call address can
03489     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03490     // callee-saved registers are restored. These happen to be the same
03491     // registers used to pass 'inreg' arguments so watch out for those.
03492     if (!Subtarget->is64Bit() &&
03493         ((!isa<GlobalAddressSDNode>(Callee) &&
03494           !isa<ExternalSymbolSDNode>(Callee)) ||
03495          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
03496       unsigned NumInRegs = 0;
03497       // In PIC we need an extra register to formulate the address computation
03498       // for the callee.
03499       unsigned MaxInRegs =
03500   (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03501 
03502       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03503         CCValAssign &VA = ArgLocs[i];
03504         if (!VA.isRegLoc())
03505           continue;
03506         unsigned Reg = VA.getLocReg();
03507         switch (Reg) {
03508         default: break;
03509         case X86::EAX: case X86::EDX: case X86::ECX:
03510           if (++NumInRegs == MaxInRegs)
03511             return false;
03512           break;
03513         }
03514       }
03515     }
03516   }
03517 
03518   return true;
03519 }
03520 
03521 FastISel *
03522 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03523                                   const TargetLibraryInfo *libInfo) const {
03524   return X86::createFastISel(funcInfo, libInfo);
03525 }
03526 
03527 //===----------------------------------------------------------------------===//
03528 //                           Other Lowering Hooks
03529 //===----------------------------------------------------------------------===//
03530 
03531 static bool MayFoldLoad(SDValue Op) {
03532   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03533 }
03534 
03535 static bool MayFoldIntoStore(SDValue Op) {
03536   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03537 }
03538 
03539 static bool isTargetShuffle(unsigned Opcode) {
03540   switch(Opcode) {
03541   default: return false;
03542   case X86ISD::BLENDI:
03543   case X86ISD::PSHUFB:
03544   case X86ISD::PSHUFD:
03545   case X86ISD::PSHUFHW:
03546   case X86ISD::PSHUFLW:
03547   case X86ISD::SHUFP:
03548   case X86ISD::PALIGNR:
03549   case X86ISD::MOVLHPS:
03550   case X86ISD::MOVLHPD:
03551   case X86ISD::MOVHLPS:
03552   case X86ISD::MOVLPS:
03553   case X86ISD::MOVLPD:
03554   case X86ISD::MOVSHDUP:
03555   case X86ISD::MOVSLDUP:
03556   case X86ISD::MOVDDUP:
03557   case X86ISD::MOVSS:
03558   case X86ISD::MOVSD:
03559   case X86ISD::UNPCKL:
03560   case X86ISD::UNPCKH:
03561   case X86ISD::VPERMILPI:
03562   case X86ISD::VPERM2X128:
03563   case X86ISD::VPERMI:
03564     return true;
03565   }
03566 }
03567 
03568 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03569                                     SDValue V1, SelectionDAG &DAG) {
03570   switch(Opc) {
03571   default: llvm_unreachable("Unknown x86 shuffle node");
03572   case X86ISD::MOVSHDUP:
03573   case X86ISD::MOVSLDUP:
03574   case X86ISD::MOVDDUP:
03575     return DAG.getNode(Opc, dl, VT, V1);
03576   }
03577 }
03578 
03579 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03580                                     SDValue V1, unsigned TargetMask,
03581                                     SelectionDAG &DAG) {
03582   switch(Opc) {
03583   default: llvm_unreachable("Unknown x86 shuffle node");
03584   case X86ISD::PSHUFD:
03585   case X86ISD::PSHUFHW:
03586   case X86ISD::PSHUFLW:
03587   case X86ISD::VPERMILPI:
03588   case X86ISD::VPERMI:
03589     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03590   }
03591 }
03592 
03593 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03594                                     SDValue V1, SDValue V2, unsigned TargetMask,
03595                                     SelectionDAG &DAG) {
03596   switch(Opc) {
03597   default: llvm_unreachable("Unknown x86 shuffle node");
03598   case X86ISD::PALIGNR:
03599   case X86ISD::VALIGN:
03600   case X86ISD::SHUFP:
03601   case X86ISD::VPERM2X128:
03602     return DAG.getNode(Opc, dl, VT, V1, V2,
03603                        DAG.getConstant(TargetMask, MVT::i8));
03604   }
03605 }
03606 
03607 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03608                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03609   switch(Opc) {
03610   default: llvm_unreachable("Unknown x86 shuffle node");
03611   case X86ISD::MOVLHPS:
03612   case X86ISD::MOVLHPD:
03613   case X86ISD::MOVHLPS:
03614   case X86ISD::MOVLPS:
03615   case X86ISD::MOVLPD:
03616   case X86ISD::MOVSS:
03617   case X86ISD::MOVSD:
03618   case X86ISD::UNPCKL:
03619   case X86ISD::UNPCKH:
03620     return DAG.getNode(Opc, dl, VT, V1, V2);
03621   }
03622 }
03623 
03624 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03625   MachineFunction &MF = DAG.getMachineFunction();
03626   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03627       DAG.getSubtarget().getRegisterInfo());
03628   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03629   int ReturnAddrIndex = FuncInfo->getRAIndex();
03630 
03631   if (ReturnAddrIndex == 0) {
03632     // Set up a frame object for the return address.
03633     unsigned SlotSize = RegInfo->getSlotSize();
03634     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03635                                                            -(int64_t)SlotSize,
03636                                                            false);
03637     FuncInfo->setRAIndex(ReturnAddrIndex);
03638   }
03639 
03640   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03641 }
03642 
03643 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03644                                        bool hasSymbolicDisplacement) {
03645   // Offset should fit into 32 bit immediate field.
03646   if (!isInt<32>(Offset))
03647     return false;
03648 
03649   // If we don't have a symbolic displacement - we don't have any extra
03650   // restrictions.
03651   if (!hasSymbolicDisplacement)
03652     return true;
03653 
03654   // FIXME: Some tweaks might be needed for medium code model.
03655   if (M != CodeModel::Small && M != CodeModel::Kernel)
03656     return false;
03657 
03658   // For small code model we assume that latest object is 16MB before end of 31
03659   // bits boundary. We may also accept pretty large negative constants knowing
03660   // that all objects are in the positive half of address space.
03661   if (M == CodeModel::Small && Offset < 16*1024*1024)
03662     return true;
03663 
03664   // For kernel code model we know that all object resist in the negative half
03665   // of 32bits address space. We may not accept negative offsets, since they may
03666   // be just off and we may accept pretty large positive ones.
03667   if (M == CodeModel::Kernel && Offset > 0)
03668     return true;
03669 
03670   return false;
03671 }
03672 
03673 /// isCalleePop - Determines whether the callee is required to pop its
03674 /// own arguments. Callee pop is necessary to support tail calls.
03675 bool X86::isCalleePop(CallingConv::ID CallingConv,
03676                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03677   switch (CallingConv) {
03678   default:
03679     return false;
03680   case CallingConv::X86_StdCall:
03681   case CallingConv::X86_FastCall:
03682   case CallingConv::X86_ThisCall:
03683     return !is64Bit;
03684   case CallingConv::Fast:
03685   case CallingConv::GHC:
03686   case CallingConv::HiPE:
03687     if (IsVarArg)
03688       return false;
03689     return TailCallOpt;
03690   }
03691 }
03692 
03693 /// \brief Return true if the condition is an unsigned comparison operation.
03694 static bool isX86CCUnsigned(unsigned X86CC) {
03695   switch (X86CC) {
03696   default: llvm_unreachable("Invalid integer condition!");
03697   case X86::COND_E:     return true;
03698   case X86::COND_G:     return false;
03699   case X86::COND_GE:    return false;
03700   case X86::COND_L:     return false;
03701   case X86::COND_LE:    return false;
03702   case X86::COND_NE:    return true;
03703   case X86::COND_B:     return true;
03704   case X86::COND_A:     return true;
03705   case X86::COND_BE:    return true;
03706   case X86::COND_AE:    return true;
03707   }
03708   llvm_unreachable("covered switch fell through?!");
03709 }
03710 
03711 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
03712 /// specific condition code, returning the condition code and the LHS/RHS of the
03713 /// comparison to make.
03714 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03715                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03716   if (!isFP) {
03717     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03718       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03719         // X > -1   -> X == 0, jump !sign.
03720         RHS = DAG.getConstant(0, RHS.getValueType());
03721         return X86::COND_NS;
03722       }
03723       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03724         // X < 0   -> X == 0, jump on sign.
03725         return X86::COND_S;
03726       }
03727       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03728         // X < 1   -> X <= 0
03729         RHS = DAG.getConstant(0, RHS.getValueType());
03730         return X86::COND_LE;
03731       }
03732     }
03733 
03734     switch (SetCCOpcode) {
03735     default: llvm_unreachable("Invalid integer condition!");
03736     case ISD::SETEQ:  return X86::COND_E;
03737     case ISD::SETGT:  return X86::COND_G;
03738     case ISD::SETGE:  return X86::COND_GE;
03739     case ISD::SETLT:  return X86::COND_L;
03740     case ISD::SETLE:  return X86::COND_LE;
03741     case ISD::SETNE:  return X86::COND_NE;
03742     case ISD::SETULT: return X86::COND_B;
03743     case ISD::SETUGT: return X86::COND_A;
03744     case ISD::SETULE: return X86::COND_BE;
03745     case ISD::SETUGE: return X86::COND_AE;
03746     }
03747   }
03748 
03749   // First determine if it is required or is profitable to flip the operands.
03750 
03751   // If LHS is a foldable load, but RHS is not, flip the condition.
03752   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03753       !ISD::isNON_EXTLoad(RHS.getNode())) {
03754     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03755     std::swap(LHS, RHS);
03756   }
03757 
03758   switch (SetCCOpcode) {
03759   default: break;
03760   case ISD::SETOLT:
03761   case ISD::SETOLE:
03762   case ISD::SETUGT:
03763   case ISD::SETUGE:
03764     std::swap(LHS, RHS);
03765     break;
03766   }
03767 
03768   // On a floating point condition, the flags are set as follows:
03769   // ZF  PF  CF   op
03770   //  0 | 0 | 0 | X > Y
03771   //  0 | 0 | 1 | X < Y
03772   //  1 | 0 | 0 | X == Y
03773   //  1 | 1 | 1 | unordered
03774   switch (SetCCOpcode) {
03775   default: llvm_unreachable("Condcode should be pre-legalized away");
03776   case ISD::SETUEQ:
03777   case ISD::SETEQ:   return X86::COND_E;
03778   case ISD::SETOLT:              // flipped
03779   case ISD::SETOGT:
03780   case ISD::SETGT:   return X86::COND_A;
03781   case ISD::SETOLE:              // flipped
03782   case ISD::SETOGE:
03783   case ISD::SETGE:   return X86::COND_AE;
03784   case ISD::SETUGT:              // flipped
03785   case ISD::SETULT:
03786   case ISD::SETLT:   return X86::COND_B;
03787   case ISD::SETUGE:              // flipped
03788   case ISD::SETULE:
03789   case ISD::SETLE:   return X86::COND_BE;
03790   case ISD::SETONE:
03791   case ISD::SETNE:   return X86::COND_NE;
03792   case ISD::SETUO:   return X86::COND_P;
03793   case ISD::SETO:    return X86::COND_NP;
03794   case ISD::SETOEQ:
03795   case ISD::SETUNE:  return X86::COND_INVALID;
03796   }
03797 }
03798 
03799 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
03800 /// code. Current x86 isa includes the following FP cmov instructions:
03801 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03802 static bool hasFPCMov(unsigned X86CC) {
03803   switch (X86CC) {
03804   default:
03805     return false;
03806   case X86::COND_B:
03807   case X86::COND_BE:
03808   case X86::COND_E:
03809   case X86::COND_P:
03810   case X86::COND_A:
03811   case X86::COND_AE:
03812   case X86::COND_NE:
03813   case X86::COND_NP:
03814     return true;
03815   }
03816 }
03817 
03818 /// isFPImmLegal - Returns true if the target can instruction select the
03819 /// specified FP immediate natively. If false, the legalizer will
03820 /// materialize the FP immediate as a load from a constant pool.
03821 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03822   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03823     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03824       return true;
03825   }
03826   return false;
03827 }
03828 
03829 /// \brief Returns true if it is beneficial to convert a load of a constant
03830 /// to just the constant itself.
03831 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03832                                                           Type *Ty) const {
03833   assert(Ty->isIntegerTy());
03834 
03835   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03836   if (BitSize == 0 || BitSize > 64)
03837     return false;
03838   return true;
03839 }
03840 
03841 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03842 /// the specified range (L, H].
03843 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03844   return (Val < 0) || (Val >= Low && Val < Hi);
03845 }
03846 
03847 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03848 /// specified value.
03849 static bool isUndefOrEqual(int Val, int CmpVal) {
03850   return (Val < 0 || Val == CmpVal);
03851 }
03852 
03853 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03854 /// from position Pos and ending in Pos+Size, falls within the specified
03855 /// sequential range (L, L+Pos]. or is undef.
03856 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03857                                        unsigned Pos, unsigned Size, int Low) {
03858   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03859     if (!isUndefOrEqual(Mask[i], Low))
03860       return false;
03861   return true;
03862 }
03863 
03864 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
03865 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
03866 /// the second operand.
03867 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
03868   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
03869     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
03870   if (VT == MVT::v2f64 || VT == MVT::v2i64)
03871     return (Mask[0] < 2 && Mask[1] < 2);
03872   return false;
03873 }
03874 
03875 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
03876 /// is suitable for input to PSHUFHW.
03877 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03878   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03879     return false;
03880 
03881   // Lower quadword copied in order or undef.
03882   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
03883     return false;
03884 
03885   // Upper quadword shuffled.
03886   for (unsigned i = 4; i != 8; ++i)
03887     if (!isUndefOrInRange(Mask[i], 4, 8))
03888       return false;
03889 
03890   if (VT == MVT::v16i16) {
03891     // Lower quadword copied in order or undef.
03892     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
03893       return false;
03894 
03895     // Upper quadword shuffled.
03896     for (unsigned i = 12; i != 16; ++i)
03897       if (!isUndefOrInRange(Mask[i], 12, 16))
03898         return false;
03899   }
03900 
03901   return true;
03902 }
03903 
03904 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
03905 /// is suitable for input to PSHUFLW.
03906 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03907   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03908     return false;
03909 
03910   // Upper quadword copied in order.
03911   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
03912     return false;
03913 
03914   // Lower quadword shuffled.
03915   for (unsigned i = 0; i != 4; ++i)
03916     if (!isUndefOrInRange(Mask[i], 0, 4))
03917       return false;
03918 
03919   if (VT == MVT::v16i16) {
03920     // Upper quadword copied in order.
03921     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
03922       return false;
03923 
03924     // Lower quadword shuffled.
03925     for (unsigned i = 8; i != 12; ++i)
03926       if (!isUndefOrInRange(Mask[i], 8, 12))
03927         return false;
03928   }
03929 
03930   return true;
03931 }
03932 
03933 /// \brief Return true if the mask specifies a shuffle of elements that is
03934 /// suitable for input to intralane (palignr) or interlane (valign) vector
03935 /// right-shift.
03936 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
03937   unsigned NumElts = VT.getVectorNumElements();
03938   unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
03939   unsigned NumLaneElts = NumElts/NumLanes;
03940 
03941   // Do not handle 64-bit element shuffles with palignr.
03942   if (NumLaneElts == 2)
03943     return false;
03944 
03945   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
03946     unsigned i;
03947     for (i = 0; i != NumLaneElts; ++i) {
03948       if (Mask[i+l] >= 0)
03949         break;
03950     }
03951 
03952     // Lane is all undef, go to next lane
03953     if (i == NumLaneElts)
03954       continue;
03955 
03956     int Start = Mask[i+l];
03957 
03958     // Make sure its in this lane in one of the sources
03959     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
03960         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
03961       return false;
03962 
03963     // If not lane 0, then we must match lane 0
03964     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
03965       return false;
03966 
03967     // Correct second source to be contiguous with first source
03968     if (Start >= (int)NumElts)
03969       Start -= NumElts - NumLaneElts;
03970 
03971     // Make sure we're shifting in the right direction.
03972     if (Start <= (int)(i+l))
03973       return false;
03974 
03975     Start -= i;
03976 
03977     // Check the rest of the elements to see if they are consecutive.
03978     for (++i; i != NumLaneElts; ++i) {
03979       int Idx = Mask[i+l];
03980 
03981       // Make sure its in this lane
03982       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
03983           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
03984         return false;
03985 
03986       // If not lane 0, then we must match lane 0
03987       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
03988         return false;
03989 
03990       if (Idx >= (int)NumElts)
03991         Idx -= NumElts - NumLaneElts;
03992 
03993       if (!isUndefOrEqual(Idx, Start+i))
03994         return false;
03995 
03996     }
03997   }
03998 
03999   return true;
04000 }
04001 
04002 /// \brief Return true if the node specifies a shuffle of elements that is
04003 /// suitable for input to PALIGNR.
04004 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
04005                           const X86Subtarget *Subtarget) {
04006   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
04007       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
04008       VT.is512BitVector())
04009     // FIXME: Add AVX512BW.
04010     return false;
04011 
04012   return isAlignrMask(Mask, VT, false);
04013 }
04014 
04015 /// \brief Return true if the node specifies a shuffle of elements that is
04016 /// suitable for input to VALIGN.
04017 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
04018                           const X86Subtarget *Subtarget) {
04019   // FIXME: Add AVX512VL.
04020   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
04021     return false;
04022   return isAlignrMask(Mask, VT, true);
04023 }
04024 
04025 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
04026 /// the two vector operands have swapped position.
04027 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
04028                                      unsigned NumElems) {
04029   for (unsigned i = 0; i != NumElems; ++i) {
04030     int idx = Mask[i];
04031     if (idx < 0)
04032       continue;
04033     else if (idx < (int)NumElems)
04034       Mask[i] = idx + NumElems;
04035     else
04036       Mask[i] = idx - NumElems;
04037   }
04038 }
04039 
04040 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
04041 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
04042 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
04043 /// reverse of what x86 shuffles want.
04044 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
04045 
04046   unsigned NumElems = VT.getVectorNumElements();
04047   unsigned NumLanes = VT.getSizeInBits()/128;
04048   unsigned NumLaneElems = NumElems/NumLanes;
04049 
04050   if (NumLaneElems != 2 && NumLaneElems != 4)
04051     return false;
04052 
04053   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04054   bool symetricMaskRequired =
04055     (VT.getSizeInBits() >= 256) && (EltSize == 32);
04056 
04057   // VSHUFPSY divides the resulting vector into 4 chunks.
04058   // The sources are also splitted into 4 chunks, and each destination
04059   // chunk must come from a different source chunk.
04060   //
04061   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
04062   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
04063   //
04064   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
04065   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
04066   //
04067   // VSHUFPDY divides the resulting vector into 4 chunks.
04068   // The sources are also splitted into 4 chunks, and each destination
04069   // chunk must come from a different source chunk.
04070   //
04071   //  SRC1 =>      X3       X2       X1       X0
04072   //  SRC2 =>      Y3       Y2       Y1       Y0
04073   //
04074   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
04075   //
04076   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
04077   unsigned HalfLaneElems = NumLaneElems/2;
04078   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
04079     for (unsigned i = 0; i != NumLaneElems; ++i) {
04080       int Idx = Mask[i+l];
04081       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
04082       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
04083         return false;
04084       // For VSHUFPSY, the mask of the second half must be the same as the
04085       // first but with the appropriate offsets. This works in the same way as
04086       // VPERMILPS works with masks.
04087       if (!symetricMaskRequired || Idx < 0)
04088         continue;
04089       if (MaskVal[i] < 0) {
04090         MaskVal[i] = Idx - l;
04091         continue;
04092       }
04093       if ((signed)(Idx - l) != MaskVal[i])
04094         return false;
04095     }
04096   }
04097 
04098   return true;
04099 }
04100 
04101 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
04102 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
04103 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
04104   if (!VT.is128BitVector())
04105     return false;
04106 
04107   unsigned NumElems = VT.getVectorNumElements();
04108 
04109   if (NumElems != 4)
04110     return false;
04111 
04112   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
04113   return isUndefOrEqual(Mask[0], 6) &&
04114          isUndefOrEqual(Mask[1], 7) &&
04115          isUndefOrEqual(Mask[2], 2) &&
04116          isUndefOrEqual(Mask[3], 3);
04117 }
04118 
04119 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
04120 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
04121 /// <2, 3, 2, 3>
04122 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
04123   if (!VT.is128BitVector())
04124     return false;
04125 
04126   unsigned NumElems = VT.getVectorNumElements();
04127 
04128   if (NumElems != 4)
04129     return false;
04130 
04131   return isUndefOrEqual(Mask[0], 2) &&
04132          isUndefOrEqual(Mask[1], 3) &&
04133          isUndefOrEqual(Mask[2], 2) &&
04134          isUndefOrEqual(Mask[3], 3);
04135 }
04136 
04137 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
04138 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
04139 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
04140   if (!VT.is128BitVector())
04141     return false;
04142 
04143   unsigned NumElems = VT.getVectorNumElements();
04144 
04145   if (NumElems != 2 && NumElems != 4)
04146     return false;
04147 
04148   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04149     if (!isUndefOrEqual(Mask[i], i + NumElems))
04150       return false;
04151 
04152   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
04153     if (!isUndefOrEqual(Mask[i], i))
04154       return false;
04155 
04156   return true;
04157 }
04158 
04159 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
04160 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
04161 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
04162   if (!VT.is128BitVector())
04163     return false;
04164 
04165   unsigned NumElems = VT.getVectorNumElements();
04166 
04167   if (NumElems != 2 && NumElems != 4)
04168     return false;
04169 
04170   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04171     if (!isUndefOrEqual(Mask[i], i))
04172       return false;
04173 
04174   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04175     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
04176       return false;
04177 
04178   return true;
04179 }
04180 
04181 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
04182 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
04183 /// i. e: If all but one element come from the same vector.
04184 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
04185   // TODO: Deal with AVX's VINSERTPS
04186   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
04187     return false;
04188 
04189   unsigned CorrectPosV1 = 0;
04190   unsigned CorrectPosV2 = 0;
04191   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
04192     if (Mask[i] == -1) {
04193       ++CorrectPosV1;
04194       ++CorrectPosV2;
04195       continue;
04196     }
04197 
04198     if (Mask[i] == i)
04199       ++CorrectPosV1;
04200     else if (Mask[i] == i + 4)
04201       ++CorrectPosV2;
04202   }
04203 
04204   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
04205     // We have 3 elements (undefs count as elements from any vector) from one
04206     // vector, and one from another.
04207     return true;
04208 
04209   return false;
04210 }
04211 
04212 //
04213 // Some special combinations that can be optimized.
04214 //
04215 static
04216 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
04217                                SelectionDAG &DAG) {
04218   MVT VT = SVOp->getSimpleValueType(0);
04219   SDLoc dl(SVOp);
04220 
04221   if (VT != MVT::v8i32 && VT != MVT::v8f32)
04222     return SDValue();
04223 
04224   ArrayRef<int> Mask = SVOp->getMask();
04225 
04226   // These are the special masks that may be optimized.
04227   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
04228   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
04229   bool MatchEvenMask = true;
04230   bool MatchOddMask  = true;
04231   for (int i=0; i<8; ++i) {
04232     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
04233       MatchEvenMask = false;
04234     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
04235       MatchOddMask = false;
04236   }
04237 
04238   if (!MatchEvenMask && !MatchOddMask)
04239     return SDValue();
04240 
04241   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
04242 
04243   SDValue Op0 = SVOp->getOperand(0);
04244   SDValue Op1 = SVOp->getOperand(1);
04245 
04246   if (MatchEvenMask) {
04247     // Shift the second operand right to 32 bits.
04248     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
04249     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
04250   } else {
04251     // Shift the first operand left to 32 bits.
04252     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
04253     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
04254   }
04255   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
04256   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
04257 }
04258 
04259 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
04260 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
04261 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
04262                          bool HasInt256, bool V2IsSplat = false) {
04263 
04264   assert(VT.getSizeInBits() >= 128 &&
04265          "Unsupported vector type for unpckl");
04266 
04267   unsigned NumElts = VT.getVectorNumElements();
04268   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04269       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04270     return false;
04271 
04272   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
04273          "Unsupported vector type for unpckh");
04274 
04275   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04276   unsigned NumLanes = VT.getSizeInBits()/128;
04277   unsigned NumLaneElts = NumElts/NumLanes;
04278 
04279   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04280     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04281       int BitI  = Mask[l+i];
04282       int BitI1 = Mask[l+i+1];
04283       if (!isUndefOrEqual(BitI, j))
04284         return false;
04285       if (V2IsSplat) {
04286         if (!isUndefOrEqual(BitI1, NumElts))
04287           return false;
04288       } else {
04289         if (!isUndefOrEqual(BitI1, j + NumElts))
04290           return false;
04291       }
04292     }
04293   }
04294 
04295   return true;
04296 }
04297 
04298 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
04299 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
04300 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
04301                          bool HasInt256, bool V2IsSplat = false) {
04302   assert(VT.getSizeInBits() >= 128 &&
04303          "Unsupported vector type for unpckh");
04304 
04305   unsigned NumElts = VT.getVectorNumElements();
04306   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04307       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04308     return false;
04309 
04310   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
04311          "Unsupported vector type for unpckh");
04312 
04313   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04314   unsigned NumLanes = VT.getSizeInBits()/128;
04315   unsigned NumLaneElts = NumElts/NumLanes;
04316 
04317   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04318     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04319       int BitI  = Mask[l+i];
04320       int BitI1 = Mask[l+i+1];
04321       if (!isUndefOrEqual(BitI, j))
04322         return false;
04323       if (V2IsSplat) {
04324         if (isUndefOrEqual(BitI1, NumElts))
04325           return false;
04326       } else {
04327         if (!isUndefOrEqual(BitI1, j+NumElts))
04328           return false;
04329       }
04330     }
04331   }
04332   return true;
04333 }
04334 
04335 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
04336 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
04337 /// <0, 0, 1, 1>
04338 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04339   unsigned NumElts = VT.getVectorNumElements();
04340   bool Is256BitVec = VT.is256BitVector();
04341 
04342   if (VT.is512BitVector())
04343     return false;
04344   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04345          "Unsupported vector type for unpckh");
04346 
04347   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
04348       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04349     return false;
04350 
04351   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
04352   // FIXME: Need a better way to get rid of this, there's no latency difference
04353   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
04354   // the former later. We should also remove the "_undef" special mask.
04355   if (NumElts == 4 && Is256BitVec)
04356     return false;
04357 
04358   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04359   // independently on 128-bit lanes.
04360   unsigned NumLanes = VT.getSizeInBits()/128;
04361   unsigned NumLaneElts = NumElts/NumLanes;
04362 
04363   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04364     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04365       int BitI  = Mask[l+i];
04366       int BitI1 = Mask[l+i+1];
04367 
04368       if (!isUndefOrEqual(BitI, j))
04369         return false;
04370       if (!isUndefOrEqual(BitI1, j))
04371         return false;
04372     }
04373   }
04374 
04375   return true;
04376 }
04377 
04378 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
04379 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
04380 /// <2, 2, 3, 3>
04381 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04382   unsigned NumElts = VT.getVectorNumElements();
04383 
04384   if (VT.is512BitVector())
04385     return false;
04386 
04387   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04388          "Unsupported vector type for unpckh");
04389 
04390   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04391       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04392     return false;
04393 
04394   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04395   // independently on 128-bit lanes.
04396   unsigned NumLanes = VT.getSizeInBits()/128;
04397   unsigned NumLaneElts = NumElts/NumLanes;
04398 
04399   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04400     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04401       int BitI  = Mask[l+i];
04402       int BitI1 = Mask[l+i+1];
04403       if (!isUndefOrEqual(BitI, j))
04404         return false;
04405       if (!isUndefOrEqual(BitI1, j))
04406         return false;
04407     }
04408   }
04409   return true;
04410 }
04411 
04412 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
04413 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors
04414 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
04415   if (!VT.is512BitVector())
04416     return false;
04417 
04418   unsigned NumElts = VT.getVectorNumElements();
04419   unsigned HalfSize = NumElts/2;
04420   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
04421     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
04422       *Imm = 1;
04423       return true;
04424     }
04425   }
04426   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
04427     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
04428       *Imm = 0;
04429       return true;
04430     }
04431   }
04432   return false;
04433 }
04434 
04435 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
04436 /// specifies a shuffle of elements that is suitable for input to MOVSS,
04437 /// MOVSD, and MOVD, i.e. setting the lowest element.
04438 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
04439   if (VT.getVectorElementType().getSizeInBits() < 32)
04440     return false;
04441   if (!VT.is128BitVector())
04442     return false;
04443 
04444   unsigned NumElts = VT.getVectorNumElements();
04445 
04446   if (!isUndefOrEqual(Mask[0], NumElts))
04447     return false;
04448 
04449   for (unsigned i = 1; i != NumElts; ++i)
04450     if (!isUndefOrEqual(Mask[i], i))
04451       return false;
04452 
04453   return true;
04454 }
04455 
04456 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
04457 /// as permutations between 128-bit chunks or halves. As an example: this
04458 /// shuffle bellow:
04459 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
04460 /// The first half comes from the second half of V1 and the second half from the
04461 /// the second half of V2.
04462 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04463   if (!HasFp256 || !VT.is256BitVector())
04464     return false;
04465 
04466   // The shuffle result is divided into half A and half B. In total the two
04467   // sources have 4 halves, namely: C, D, E, F. The final values of A and
04468   // B must come from C, D, E or F.
04469   unsigned HalfSize = VT.getVectorNumElements()/2;
04470   bool MatchA = false, MatchB = false;
04471 
04472   // Check if A comes from one of C, D, E, F.
04473   for (unsigned Half = 0; Half != 4; ++Half) {
04474     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
04475       MatchA = true;
04476       break;
04477     }
04478   }
04479 
04480   // Check if B comes from one of C, D, E, F.
04481   for (unsigned Half = 0; Half != 4; ++Half) {
04482     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
04483       MatchB = true;
04484       break;
04485     }
04486   }
04487 
04488   return MatchA && MatchB;
04489 }
04490 
04491 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
04492 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
04493 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
04494   MVT VT = SVOp->getSimpleValueType(0);
04495 
04496   unsigned HalfSize = VT.getVectorNumElements()/2;
04497 
04498   unsigned FstHalf = 0, SndHalf = 0;
04499   for (unsigned i = 0; i < HalfSize; ++i) {
04500     if (SVOp->getMaskElt(i) > 0) {
04501       FstHalf = SVOp->getMaskElt(i)/HalfSize;
04502       break;
04503     }
04504   }
04505   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
04506     if (SVOp->getMaskElt(i) > 0) {
04507       SndHalf = SVOp->getMaskElt(i)/HalfSize;
04508       break;
04509     }
04510   }
04511 
04512   return (FstHalf | (SndHalf << 4));
04513 }
04514 
04515 // Symetric in-lane mask. Each lane has 4 elements (for imm8)
04516 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
04517   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04518   if (EltSize < 32)
04519     return false;
04520 
04521   unsigned NumElts = VT.getVectorNumElements();
04522   Imm8 = 0;
04523   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
04524     for (unsigned i = 0; i != NumElts; ++i) {
04525       if (Mask[i] < 0)
04526         continue;
04527       Imm8 |= Mask[i] << (i*2);
04528     }
04529     return true;
04530   }
04531 
04532   unsigned LaneSize = 4;
04533   SmallVector<int, 4> MaskVal(LaneSize, -1);
04534 
04535   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04536     for (unsigned i = 0; i != LaneSize; ++i) {
04537       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04538         return false;
04539       if (Mask[i+l] < 0)
04540         continue;
04541       if (MaskVal[i] < 0) {
04542         MaskVal[i] = Mask[i+l] - l;
04543         Imm8 |= MaskVal[i] << (i*2);
04544         continue;
04545       }
04546       if (Mask[i+l] != (signed)(MaskVal[i]+l))
04547         return false;
04548     }
04549   }
04550   return true;
04551 }
04552 
04553 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
04554 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
04555 /// Note that VPERMIL mask matching is different depending whether theunderlying
04556 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
04557 /// to the same elements of the low, but to the higher half of the source.
04558 /// In VPERMILPD the two lanes could be shuffled independently of each other
04559 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
04560 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
04561   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04562   if (VT.getSizeInBits() < 256 || EltSize < 32)
04563     return false;
04564   bool symetricMaskRequired = (EltSize == 32);
04565   unsigned NumElts = VT.getVectorNumElements();
04566 
04567   unsigned NumLanes = VT.getSizeInBits()/128;
04568   unsigned LaneSize = NumElts/NumLanes;
04569   // 2 or 4 elements in one lane
04570 
04571   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
04572   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04573     for (unsigned i = 0; i != LaneSize; ++i) {
04574       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04575         return false;
04576       if (symetricMaskRequired) {
04577         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
04578           ExpectedMaskVal[i] = Mask[i+l] - l;
04579           continue;
04580         }
04581         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
04582           return false;
04583       }
04584     }
04585   }
04586   return true;
04587 }
04588 
04589 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
04590 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
04591 /// element of vector 2 and the other elements to come from vector 1 in order.
04592 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
04593                                bool V2IsSplat = false, bool V2IsUndef = false) {
04594   if (!VT.is128BitVector())
04595     return false;
04596 
04597   unsigned NumOps = VT.getVectorNumElements();
04598   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
04599     return false;
04600 
04601   if (!isUndefOrEqual(Mask[0], 0))
04602     return false;
04603 
04604   for (unsigned i = 1; i != NumOps; ++i)
04605     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
04606           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
04607           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
04608       return false;
04609 
04610   return true;
04611 }
04612 
04613 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04614 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
04615 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
04616 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
04617                            const X86Subtarget *Subtarget) {
04618   if (!Subtarget->hasSSE3())
04619     return false;
04620 
04621   unsigned NumElems = VT.getVectorNumElements();
04622 
04623   if ((VT.is128BitVector() && NumElems != 4) ||
04624       (VT.is256BitVector() && NumElems != 8) ||
04625       (VT.is512BitVector() && NumElems != 16))
04626     return false;
04627 
04628   // "i+1" is the value the indexed mask element must have
04629   for (unsigned i = 0; i != NumElems; i += 2)
04630     if (!isUndefOrEqual(Mask[i], i+1) ||
04631         !isUndefOrEqual(Mask[i+1], i+1))
04632       return false;
04633 
04634   return true;
04635 }
04636 
04637 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04638 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
04639 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
04640 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
04641                            const X86Subtarget *Subtarget) {
04642   if (!Subtarget->hasSSE3())
04643     return false;
04644 
04645   unsigned NumElems = VT.getVectorNumElements();
04646 
04647   if ((VT.is128BitVector() && NumElems != 4) ||
04648       (VT.is256BitVector() && NumElems != 8) ||
04649       (VT.is512BitVector() && NumElems != 16))
04650     return false;
04651 
04652   // "i" is the value the indexed mask element must have
04653   for (unsigned i = 0; i != NumElems; i += 2)
04654     if (!isUndefOrEqual(Mask[i], i) ||
04655         !isUndefOrEqual(Mask[i+1], i))
04656       return false;
04657 
04658   return true;
04659 }
04660 
04661 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
04662 /// specifies a shuffle of elements that is suitable for input to 256-bit
04663 /// version of MOVDDUP.
04664 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04665   if (!HasFp256 || !VT.is256BitVector())
04666     return false;
04667 
04668   unsigned NumElts = VT.getVectorNumElements();
04669   if (NumElts != 4)
04670     return false;
04671 
04672   for (unsigned i = 0; i != NumElts/2; ++i)
04673     if (!isUndefOrEqual(Mask[i], 0))
04674       return false;
04675   for (unsigned i = NumElts/2; i != NumElts; ++i)
04676     if (!isUndefOrEqual(Mask[i], NumElts/2))
04677       return false;
04678   return true;
04679 }
04680 
04681 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04682 /// specifies a shuffle of elements that is suitable for input to 128-bit
04683 /// version of MOVDDUP.
04684 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
04685   if (!VT.is128BitVector())
04686     return false;
04687 
04688   unsigned e = VT.getVectorNumElements() / 2;
04689   for (unsigned i = 0; i != e; ++i)
04690     if (!isUndefOrEqual(Mask[i], i))
04691       return false;
04692   for (unsigned i = 0; i != e; ++i)
04693     if (!isUndefOrEqual(Mask[e+i], i))
04694       return false;
04695   return true;
04696 }
04697 
04698 /// isVEXTRACTIndex - Return true if the specified
04699 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
04700 /// suitable for instruction that extract 128 or 256 bit vectors
04701 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
04702   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04703   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04704     return false;
04705 
04706   // The index should be aligned on a vecWidth-bit boundary.
04707   uint64_t Index =
04708     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04709 
04710   MVT VT = N->getSimpleValueType(0);
04711   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04712   bool Result = (Index * ElSize) % vecWidth == 0;
04713 
04714   return Result;
04715 }
04716 
04717 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
04718 /// operand specifies a subvector insert that is suitable for input to
04719 /// insertion of 128 or 256-bit subvectors
04720 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
04721   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04722   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04723     return false;
04724   // The index should be aligned on a vecWidth-bit boundary.
04725   uint64_t Index =
04726     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04727 
04728   MVT VT = N->getSimpleValueType(0);
04729   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04730   bool Result = (Index * ElSize) % vecWidth == 0;
04731 
04732   return Result;
04733 }
04734 
04735 bool X86::isVINSERT128Index(SDNode *N) {
04736   return isVINSERTIndex(N, 128);
04737 }
04738 
04739 bool X86::isVINSERT256Index(SDNode *N) {
04740   return isVINSERTIndex(N, 256);
04741 }
04742 
04743 bool X86::isVEXTRACT128Index(SDNode *N) {
04744   return isVEXTRACTIndex(N, 128);
04745 }
04746 
04747 bool X86::isVEXTRACT256Index(SDNode *N) {
04748   return isVEXTRACTIndex(N, 256);
04749 }
04750 
04751 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
04752 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
04753 /// Handles 128-bit and 256-bit.
04754 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
04755   MVT VT = N->getSimpleValueType(0);
04756 
04757   assert((VT.getSizeInBits() >= 128) &&
04758          "Unsupported vector type for PSHUF/SHUFP");
04759 
04760   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
04761   // independently on 128-bit lanes.
04762   unsigned NumElts = VT.getVectorNumElements();
04763   unsigned NumLanes = VT.getSizeInBits()/128;
04764   unsigned NumLaneElts = NumElts/NumLanes;
04765 
04766   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
04767          "Only supports 2, 4 or 8 elements per lane");
04768 
04769   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
04770   unsigned Mask = 0;
04771   for (unsigned i = 0; i != NumElts; ++i) {
04772     int Elt = N->getMaskElt(i);
04773     if (Elt < 0) continue;
04774     Elt &= NumLaneElts - 1;
04775     unsigned ShAmt = (i << Shift) % 8;
04776     Mask |= Elt << ShAmt;
04777   }
04778 
04779   return Mask;
04780 }
04781 
04782 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
04783 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
04784 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
04785   MVT VT = N->getSimpleValueType(0);
04786 
04787   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04788          "Unsupported vector type for PSHUFHW");
04789 
04790   unsigned NumElts = VT.getVectorNumElements();
04791 
04792   unsigned Mask = 0;
04793   for (unsigned l = 0; l != NumElts; l += 8) {
04794     // 8 nodes per lane, but we only care about the last 4.
04795     for (unsigned i = 0; i < 4; ++i) {
04796       int Elt = N->getMaskElt(l+i+4);
04797       if (Elt < 0) continue;
04798       Elt &= 0x3; // only 2-bits.
04799       Mask |= Elt << (i * 2);
04800     }
04801   }
04802 
04803   return Mask;
04804 }
04805 
04806 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
04807 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
04808 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
04809   MVT VT = N->getSimpleValueType(0);
04810 
04811   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04812          "Unsupported vector type for PSHUFHW");
04813 
04814   unsigned NumElts = VT.getVectorNumElements();
04815 
04816   unsigned Mask = 0;
04817   for (unsigned l = 0; l != NumElts; l += 8) {
04818     // 8 nodes per lane, but we only care about the first 4.
04819     for (unsigned i = 0; i < 4; ++i) {
04820       int Elt = N->getMaskElt(l+i);
04821       if (Elt < 0) continue;
04822       Elt &= 0x3; // only 2-bits
04823       Mask |= Elt << (i * 2);
04824     }
04825   }
04826 
04827   return Mask;
04828 }
04829 
04830 /// \brief Return the appropriate immediate to shuffle the specified
04831 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
04832 /// VALIGN (if Interlane is true) instructions.
04833 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
04834                                            bool InterLane) {
04835   MVT VT = SVOp->getSimpleValueType(0);
04836   unsigned EltSize = InterLane ? 1 :
04837     VT.getVectorElementType().getSizeInBits() >> 3;
04838 
04839   unsigned NumElts = VT.getVectorNumElements();
04840   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
04841   unsigned NumLaneElts = NumElts/NumLanes;
04842 
04843   int Val = 0;
04844   unsigned i;
04845   for (i = 0; i != NumElts; ++i) {
04846     Val = SVOp->getMaskElt(i);
04847     if (Val >= 0)
04848       break;
04849   }
04850   if (Val >= (int)NumElts)
04851     Val -= NumElts - NumLaneElts;
04852 
04853   assert(Val - i > 0 && "PALIGNR imm should be positive");
04854   return (Val - i) * EltSize;
04855 }
04856 
04857 /// \brief Return the appropriate immediate to shuffle the specified
04858 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
04859 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
04860   return getShuffleAlignrImmediate(SVOp, false);
04861 }
04862 
04863 /// \brief Return the appropriate immediate to shuffle the specified
04864 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
04865 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
04866   return getShuffleAlignrImmediate(SVOp, true);
04867 }
04868 
04869 
04870 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
04871   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04872   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04873     llvm_unreachable("Illegal extract subvector for VEXTRACT");
04874 
04875   uint64_t Index =
04876     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04877 
04878   MVT VecVT = N->getOperand(0).getSimpleValueType();
04879   MVT ElVT = VecVT.getVectorElementType();
04880 
04881   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04882   return Index / NumElemsPerChunk;
04883 }
04884 
04885 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
04886   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04887   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04888     llvm_unreachable("Illegal insert subvector for VINSERT");
04889 
04890   uint64_t Index =
04891     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04892 
04893   MVT VecVT = N->getSimpleValueType(0);
04894   MVT ElVT = VecVT.getVectorElementType();
04895 
04896   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04897   return Index / NumElemsPerChunk;
04898 }
04899 
04900 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
04901 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
04902 /// and VINSERTI128 instructions.
04903 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
04904   return getExtractVEXTRACTImmediate(N, 128);
04905 }
04906 
04907 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
04908 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
04909 /// and VINSERTI64x4 instructions.
04910 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
04911   return getExtractVEXTRACTImmediate(N, 256);
04912 }
04913 
04914 /// getInsertVINSERT128Immediate - Return the appropriate immediate
04915 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
04916 /// and VINSERTI128 instructions.
04917 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
04918   return getInsertVINSERTImmediate(N, 128);
04919 }
04920 
04921 /// getInsertVINSERT256Immediate - Return the appropriate immediate
04922 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
04923 /// and VINSERTI64x4 instructions.
04924 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
04925   return getInsertVINSERTImmediate(N, 256);
04926 }
04927 
04928 /// isZero - Returns true if Elt is a constant integer zero
04929 static bool isZero(SDValue V) {
04930   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
04931   return C && C->isNullValue();
04932 }
04933 
04934 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
04935 /// constant +0.0.
04936 bool X86::isZeroNode(SDValue Elt) {
04937   if (isZero(Elt))
04938     return true;
04939   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
04940     return CFP->getValueAPF().isPosZero();
04941   return false;
04942 }
04943 
04944 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
04945 /// match movhlps. The lower half elements should come from upper half of
04946 /// V1 (and in order), and the upper half elements should come from the upper
04947 /// half of V2 (and in order).
04948 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
04949   if (!VT.is128BitVector())
04950     return false;
04951   if (VT.getVectorNumElements() != 4)
04952     return false;
04953   for (unsigned i = 0, e = 2; i != e; ++i)
04954     if (!isUndefOrEqual(Mask[i], i+2))
04955       return false;
04956   for (unsigned i = 2; i != 4; ++i)
04957     if (!isUndefOrEqual(Mask[i], i+4))
04958       return false;
04959   return true;
04960 }
04961 
04962 /// isScalarLoadToVector - Returns true if the node is a scalar load that
04963 /// is promoted to a vector. It also returns the LoadSDNode by reference if
04964 /// required.
04965 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
04966   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
04967     return false;
04968   N = N->getOperand(0).getNode();
04969   if (!ISD::isNON_EXTLoad(N))
04970     return false;
04971   if (LD)
04972     *LD = cast<LoadSDNode>(N);
04973   return true;
04974 }
04975 
04976 // Test whether the given value is a vector value which will be legalized
04977 // into a load.
04978 static bool WillBeConstantPoolLoad(SDNode *N) {
04979   if (N->getOpcode() != ISD::BUILD_VECTOR)
04980     return false;
04981 
04982   // Check for any non-constant elements.
04983   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
04984     switch (N->getOperand(i).getNode()->getOpcode()) {
04985     case ISD::UNDEF:
04986     case ISD::ConstantFP:
04987     case ISD::Constant:
04988       break;
04989     default:
04990       return false;
04991     }
04992 
04993   // Vectors of all-zeros and all-ones are materialized with special
04994   // instructions rather than being loaded.
04995   return !ISD::isBuildVectorAllZeros(N) &&
04996          !ISD::isBuildVectorAllOnes(N);
04997 }
04998 
04999 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
05000 /// match movlp{s|d}. The lower half elements should come from lower half of
05001 /// V1 (and in order), and the upper half elements should come from the upper
05002 /// half of V2 (and in order). And since V1 will become the source of the
05003 /// MOVLP, it must be either a vector load or a scalar load to vector.
05004 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
05005                                ArrayRef<int> Mask, MVT VT) {
05006   if (!VT.is128BitVector())
05007     return false;
05008 
05009   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
05010     return false;
05011   // Is V2 is a vector load, don't do this transformation. We will try to use
05012   // load folding shufps op.
05013   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
05014     return false;
05015 
05016   unsigned NumElems = VT.getVectorNumElements();
05017 
05018   if (NumElems != 2 && NumElems != 4)
05019     return false;
05020   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
05021     if (!isUndefOrEqual(Mask[i], i))
05022       return false;
05023   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
05024     if (!isUndefOrEqual(Mask[i], i+NumElems))
05025       return false;
05026   return true;
05027 }
05028 
05029 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
05030 /// to an zero vector.
05031 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
05032 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
05033   SDValue V1 = N->getOperand(0);
05034   SDValue V2 = N->getOperand(1);
05035   unsigned NumElems = N->getValueType(0).getVectorNumElements();
05036   for (unsigned i = 0; i != NumElems; ++i) {
05037     int Idx = N->getMaskElt(i);
05038     if (Idx >= (int)NumElems) {
05039       unsigned Opc = V2.getOpcode();
05040       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
05041         continue;
05042       if (Opc != ISD::BUILD_VECTOR ||
05043           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
05044         return false;
05045     } else if (Idx >= 0) {
05046       unsigned Opc = V1.getOpcode();
05047       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
05048         continue;
05049       if (Opc != ISD::BUILD_VECTOR ||
05050           !X86::isZeroNode(V1.getOperand(Idx)))
05051         return false;
05052     }
05053   }
05054   return true;
05055 }
05056 
05057 /// getZeroVector - Returns a vector of specified type with all zero elements.
05058 ///
05059 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
05060                              SelectionDAG &DAG, SDLoc dl) {
05061   assert(VT.isVector() && "Expected a vector type");
05062 
05063   // Always build SSE zero vectors as <4 x i32> bitcasted
05064   // to their dest type. This ensures they get CSE'd.
05065   SDValue Vec;
05066   if (VT.is128BitVector()) {  // SSE
05067     if (Subtarget->hasSSE2()) {  // SSE2
05068       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
05069       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05070     } else { // SSE1
05071       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
05072       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
05073     }
05074   } else if (VT.is256BitVector()) { // AVX
05075     if (Subtarget->hasInt256()) { // AVX2
05076       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
05077       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05078       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
05079     } else {
05080       // 256-bit logic and arithmetic instructions in AVX are all
05081       // floating-point, no support for integer ops. Emit fp zeroed vectors.
05082       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
05083       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05084       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
05085     }
05086   } else if (VT.is512BitVector()) { // AVX-512
05087       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
05088       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
05089                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05090       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
05091   } else if (VT.getScalarType() == MVT::i1) {
05092     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
05093     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
05094     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05095     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05096   } else
05097     llvm_unreachable("Unexpected vector type");
05098 
05099   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
05100 }
05101 
05102 /// getOnesVector - Returns a vector of specified type with all bits set.
05103 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
05104 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
05105 /// Then bitcast to their original type, ensuring they get CSE'd.
05106 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
05107                              SDLoc dl) {
05108   assert(VT.isVector() && "Expected a vector type");
05109 
05110   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
05111   SDValue Vec;
05112   if (VT.is256BitVector()) {
05113     if (HasInt256) { // AVX2
05114       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05115       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
05116     } else { // AVX
05117       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05118       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
05119     }
05120   } else if (VT.is128BitVector()) {
05121     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05122   } else
05123     llvm_unreachable("Unexpected vector type");
05124 
05125   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
05126 }
05127 
05128 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
05129 /// that point to V2 points to its first element.
05130 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
05131   for (unsigned i = 0; i != NumElems; ++i) {
05132     if (Mask[i] > (int)NumElems) {
05133       Mask[i] = NumElems;
05134     }
05135   }
05136 }
05137 
05138 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
05139 /// operation of specified width.
05140 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
05141                        SDValue V2) {
05142   unsigned NumElems = VT.getVectorNumElements();
05143   SmallVector<int, 8> Mask;
05144   Mask.push_back(NumElems);
05145   for (unsigned i = 1; i != NumElems; ++i)
05146     Mask.push_back(i);
05147   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05148 }
05149 
05150 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
05151 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05152                           SDValue V2) {
05153   unsigned NumElems = VT.getVectorNumElements();
05154   SmallVector<int, 8> Mask;
05155   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
05156     Mask.push_back(i);
05157     Mask.push_back(i + NumElems);
05158   }
05159   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05160 }
05161 
05162 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
05163 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05164                           SDValue V2) {
05165   unsigned NumElems = VT.getVectorNumElements();
05166   SmallVector<int, 8> Mask;
05167   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
05168     Mask.push_back(i + Half);
05169     Mask.push_back(i + NumElems + Half);
05170   }
05171   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05172 }
05173 
05174 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
05175 // a generic shuffle instruction because the target has no such instructions.
05176 // Generate shuffles which repeat i16 and i8 several times until they can be
05177 // represented by v4f32 and then be manipulated by target suported shuffles.
05178 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
05179   MVT VT = V.getSimpleValueType();
05180   int NumElems = VT.getVectorNumElements();
05181   SDLoc dl(V);
05182 
05183   while (NumElems > 4) {
05184     if (EltNo < NumElems/2) {
05185       V = getUnpackl(DAG, dl, VT, V, V);
05186     } else {
05187       V = getUnpackh(DAG, dl, VT, V, V);
05188       EltNo -= NumElems/2;
05189     }
05190     NumElems >>= 1;
05191   }
05192   return V;
05193 }
05194 
05195 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
05196 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
05197   MVT VT = V.getSimpleValueType();
05198   SDLoc dl(V);
05199 
05200   if (VT.is128BitVector()) {
05201     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
05202     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
05203     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
05204                              &SplatMask[0]);
05205   } else if (VT.is256BitVector()) {
05206     // To use VPERMILPS to splat scalars, the second half of indicies must
05207     // refer to the higher part, which is a duplication of the lower one,
05208     // because VPERMILPS can only handle in-lane permutations.
05209     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
05210                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
05211 
05212     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
05213     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
05214                              &SplatMask[0]);
05215   } else
05216     llvm_unreachable("Vector size not supported");
05217 
05218   return DAG.getNode(ISD::BITCAST, dl, VT, V);
05219 }
05220 
05221 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
05222 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
05223   MVT SrcVT = SV->getSimpleValueType(0);
05224   SDValue V1 = SV->getOperand(0);
05225   SDLoc dl(SV);
05226 
05227   int EltNo = SV->getSplatIndex();
05228   int NumElems = SrcVT.getVectorNumElements();
05229   bool Is256BitVec = SrcVT.is256BitVector();
05230 
05231   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
05232          "Unknown how to promote splat for type");
05233 
05234   // Extract the 128-bit part containing the splat element and update
05235   // the splat element index when it refers to the higher register.
05236   if (Is256BitVec) {
05237     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
05238     if (EltNo >= NumElems/2)
05239       EltNo -= NumElems/2;
05240   }
05241 
05242   // All i16 and i8 vector types can't be used directly by a generic shuffle
05243   // instruction because the target has no such instruction. Generate shuffles
05244   // which repeat i16 and i8 several times until they fit in i32, and then can
05245   // be manipulated by target suported shuffles.
05246   MVT EltVT = SrcVT.getVectorElementType();
05247   if (EltVT == MVT::i8 || EltVT == MVT::i16)
05248     V1 = PromoteSplati8i16(V1, DAG, EltNo);
05249 
05250   // Recreate the 256-bit vector and place the same 128-bit vector
05251   // into the low and high part. This is necessary because we want
05252   // to use VPERM* to shuffle the vectors
05253   if (Is256BitVec) {
05254     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
05255   }
05256 
05257   return getLegalSplat(DAG, V1, EltNo);
05258 }
05259 
05260 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
05261 /// vector of zero or undef vector.  This produces a shuffle where the low
05262 /// element of V2 is swizzled into the zero/undef vector, landing at element
05263 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
05264 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
05265                                            bool IsZero,
05266                                            const X86Subtarget *Subtarget,
05267                                            SelectionDAG &DAG) {
05268   MVT VT = V2.getSimpleValueType();
05269   SDValue V1 = IsZero
05270     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
05271   unsigned NumElems = VT.getVectorNumElements();
05272   SmallVector<int, 16> MaskVec;
05273   for (unsigned i = 0; i != NumElems; ++i)
05274     // If this is the insertion idx, put the low elt of V2 here.
05275     MaskVec.push_back(i == Idx ? NumElems : i);
05276   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
05277 }
05278 
05279 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
05280 /// target specific opcode. Returns true if the Mask could be calculated. Sets
05281 /// IsUnary to true if only uses one source. Note that this will set IsUnary for
05282 /// shuffles which use a single input multiple times, and in those cases it will
05283 /// adjust the mask to only have indices within that single input.
05284 static bool getTargetShuffleMask(SDNode *N, MVT VT,
05285                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
05286   unsigned NumElems = VT.getVectorNumElements();
05287   SDValue ImmN;
05288 
05289   IsUnary = false;
05290   bool IsFakeUnary = false;
05291   switch(N->getOpcode()) {
05292   case X86ISD::BLENDI:
05293     ImmN = N->getOperand(N->getNumOperands()-1);
05294     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05295     break;
05296   case X86ISD::SHUFP:
05297     ImmN = N->getOperand(N->getNumOperands()-1);
05298     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05299     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05300     break;
05301   case X86ISD::UNPCKH:
05302     DecodeUNPCKHMask(VT, Mask);
05303     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05304     break;
05305   case X86ISD::UNPCKL:
05306     DecodeUNPCKLMask(VT, Mask);
05307     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05308     break;
05309   case X86ISD::MOVHLPS:
05310     DecodeMOVHLPSMask(NumElems, Mask);
05311     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05312     break;
05313   case X86ISD::MOVLHPS:
05314     DecodeMOVLHPSMask(NumElems, Mask);
05315     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05316     break;
05317   case X86ISD::PALIGNR:
05318     ImmN = N->getOperand(N->getNumOperands()-1);
05319     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05320     break;
05321   case X86ISD::PSHUFD:
05322   case X86ISD::VPERMILPI:
05323     ImmN = N->getOperand(N->getNumOperands()-1);
05324     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05325     IsUnary = true;
05326     break;
05327   case X86ISD::PSHUFHW:
05328     ImmN = N->getOperand(N->getNumOperands()-1);
05329     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05330     IsUnary = true;
05331     break;
05332   case X86ISD::PSHUFLW:
05333     ImmN = N->getOperand(N->getNumOperands()-1);
05334     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05335     IsUnary = true;
05336     break;
05337   case X86ISD::PSHUFB: {
05338     IsUnary = true;
05339     SDValue MaskNode = N->getOperand(1);
05340     while (MaskNode->getOpcode() == ISD::BITCAST)
05341       MaskNode = MaskNode->getOperand(0);
05342 
05343     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
05344       // If we have a build-vector, then things are easy.
05345       EVT VT = MaskNode.getValueType();
05346       assert(VT.isVector() &&
05347              "Can't produce a non-vector with a build_vector!");
05348       if (!VT.isInteger())
05349         return false;
05350 
05351       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
05352 
05353       SmallVector<uint64_t, 32> RawMask;
05354       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
05355         SDValue Op = MaskNode->getOperand(i);
05356         if (Op->getOpcode() == ISD::UNDEF) {
05357           RawMask.push_back((uint64_t)SM_SentinelUndef);
05358           continue;
05359         }
05360         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
05361         if (!CN)
05362           return false;
05363         APInt MaskElement = CN->getAPIntValue();
05364 
05365         // We now have to decode the element which could be any integer size and
05366         // extract each byte of it.
05367         for (int j = 0; j < NumBytesPerElement; ++j) {
05368           // Note that this is x86 and so always little endian: the low byte is
05369           // the first byte of the mask.
05370           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
05371           MaskElement = MaskElement.lshr(8);
05372         }
05373       }
05374       DecodePSHUFBMask(RawMask, Mask);
05375       break;
05376     }
05377 
05378     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
05379     if (!MaskLoad)
05380       return false;
05381 
05382     SDValue Ptr = MaskLoad->getBasePtr();
05383     if (Ptr->getOpcode() == X86ISD::Wrapper)
05384       Ptr = Ptr->getOperand(0);
05385 
05386     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
05387     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
05388       return false;
05389 
05390     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
05391       // FIXME: Support AVX-512 here.
05392       Type *Ty = C->getType();
05393       if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 &&
05394                                 Ty->getVectorNumElements() != 32))
05395         return false;
05396 
05397       DecodePSHUFBMask(C, Mask);
05398       break;
05399     }
05400 
05401     return false;
05402   }
05403   case X86ISD::VPERMI:
05404     ImmN = N->getOperand(N->getNumOperands()-1);
05405     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05406     IsUnary = true;
05407     break;
05408   case X86ISD::MOVSS:
05409   case X86ISD::MOVSD: {
05410     // The index 0 always comes from the first element of the second source,
05411     // this is why MOVSS and MOVSD are used in the first place. The other
05412     // elements come from the other positions of the first source vector
05413     Mask.push_back(NumElems);
05414     for (unsigned i = 1; i != NumElems; ++i) {
05415       Mask.push_back(i);
05416     }
05417     break;
05418   }
05419   case X86ISD::VPERM2X128:
05420     ImmN = N->getOperand(N->getNumOperands()-1);
05421     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05422     if (Mask.empty()) return false;
05423     break;
05424   case X86ISD::MOVSLDUP:
05425     DecodeMOVSLDUPMask(VT, Mask);
05426     break;
05427   case X86ISD::MOVSHDUP:
05428     DecodeMOVSHDUPMask(VT, Mask);
05429     break;
05430   case X86ISD::MOVDDUP:
05431   case X86ISD::MOVLHPD:
05432   case X86ISD::MOVLPD:
05433   case X86ISD::MOVLPS:
05434     // Not yet implemented
05435     return false;
05436   default: llvm_unreachable("unknown target shuffle node");
05437   }
05438 
05439   // If we have a fake unary shuffle, the shuffle mask is spread across two
05440   // inputs that are actually the same node. Re-map the mask to always point
05441   // into the first input.
05442   if (IsFakeUnary)
05443     for (int &M : Mask)
05444       if (M >= (int)Mask.size())
05445         M -= Mask.size();
05446 
05447   return true;
05448 }
05449 
05450 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
05451 /// element of the result of the vector shuffle.
05452 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
05453                                    unsigned Depth) {
05454   if (Depth == 6)
05455     return SDValue();  // Limit search depth.
05456 
05457   SDValue V = SDValue(N, 0);
05458   EVT VT = V.getValueType();
05459   unsigned Opcode = V.getOpcode();
05460 
05461   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
05462   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
05463     int Elt = SV->getMaskElt(Index);
05464 
05465     if (Elt < 0)
05466       return DAG.getUNDEF(VT.getVectorElementType());
05467 
05468     unsigned NumElems = VT.getVectorNumElements();
05469     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
05470                                          : SV->getOperand(1);
05471     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
05472   }
05473 
05474   // Recurse into target specific vector shuffles to find scalars.
05475   if (isTargetShuffle(Opcode)) {
05476     MVT ShufVT = V.getSimpleValueType();
05477     unsigned NumElems = ShufVT.getVectorNumElements();
05478     SmallVector<int, 16> ShuffleMask;
05479     bool IsUnary;
05480 
05481     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
05482       return SDValue();
05483 
05484     int Elt = ShuffleMask[Index];
05485     if (Elt < 0)
05486       return DAG.getUNDEF(ShufVT.getVectorElementType());
05487 
05488     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
05489                                          : N->getOperand(1);
05490     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
05491                                Depth+1);
05492   }
05493 
05494   // Actual nodes that may contain scalar elements
05495   if (Opcode == ISD::BITCAST) {
05496     V = V.getOperand(0);
05497     EVT SrcVT = V.getValueType();
05498     unsigned NumElems = VT.getVectorNumElements();
05499 
05500     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
05501       return SDValue();
05502   }
05503 
05504   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
05505     return (Index == 0) ? V.getOperand(0)
05506                         : DAG.getUNDEF(VT.getVectorElementType());
05507 
05508   if (V.getOpcode() == ISD::BUILD_VECTOR)
05509     return V.getOperand(Index);
05510 
05511   return SDValue();
05512 }
05513 
05514 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
05515 /// shuffle operation which come from a consecutively from a zero. The
05516 /// search can start in two different directions, from left or right.
05517 /// We count undefs as zeros until PreferredNum is reached.
05518 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
05519                                          unsigned NumElems, bool ZerosFromLeft,
05520                                          SelectionDAG &DAG,
05521                                          unsigned PreferredNum = -1U) {
05522   unsigned NumZeros = 0;
05523   for (unsigned i = 0; i != NumElems; ++i) {
05524     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
05525     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
05526     if (!Elt.getNode())
05527       break;
05528 
05529     if (X86::isZeroNode(Elt))
05530       ++NumZeros;
05531     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
05532       NumZeros = std::min(NumZeros + 1, PreferredNum);
05533     else
05534       break;
05535   }
05536 
05537   return NumZeros;
05538 }
05539 
05540 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
05541 /// correspond consecutively to elements from one of the vector operands,
05542 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
05543 static
05544 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
05545                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
05546                               unsigned NumElems, unsigned &OpNum) {
05547   bool SeenV1 = false;
05548   bool SeenV2 = false;
05549 
05550   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
05551     int Idx = SVOp->getMaskElt(i);
05552     // Ignore undef indicies
05553     if (Idx < 0)
05554       continue;
05555 
05556     if (Idx < (int)NumElems)
05557       SeenV1 = true;
05558     else
05559       SeenV2 = true;
05560 
05561     // Only accept consecutive elements from the same vector
05562     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
05563       return false;
05564   }
05565 
05566   OpNum = SeenV1 ? 0 : 1;
05567   return true;
05568 }
05569 
05570 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
05571 /// logical left shift of a vector.
05572 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05573                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05574   unsigned NumElems =
05575     SVOp->getSimpleValueType(0).getVectorNumElements();
05576   unsigned NumZeros = getNumOfConsecutiveZeros(
05577       SVOp, NumElems, false /* check zeros from right */, DAG,
05578       SVOp->getMaskElt(0));
05579   unsigned OpSrc;
05580 
05581   if (!NumZeros)
05582     return false;
05583 
05584   // Considering the elements in the mask that are not consecutive zeros,
05585   // check if they consecutively come from only one of the source vectors.
05586   //
05587   //               V1 = {X, A, B, C}     0
05588   //                         \  \  \    /
05589   //   vector_shuffle V1, V2 <1, 2, 3, X>
05590   //
05591   if (!isShuffleMaskConsecutive(SVOp,
05592             0,                   // Mask Start Index
05593             NumElems-NumZeros,   // Mask End Index(exclusive)
05594             NumZeros,            // Where to start looking in the src vector
05595             NumElems,            // Number of elements in vector
05596             OpSrc))              // Which source operand ?
05597     return false;
05598 
05599   isLeft = false;
05600   ShAmt = NumZeros;
05601   ShVal = SVOp->getOperand(OpSrc);
05602   return true;
05603 }
05604 
05605 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
05606 /// logical left shift of a vector.
05607 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05608                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05609   unsigned NumElems =
05610     SVOp->getSimpleValueType(0).getVectorNumElements();
05611   unsigned NumZeros = getNumOfConsecutiveZeros(
05612       SVOp, NumElems, true /* check zeros from left */, DAG,
05613       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
05614   unsigned OpSrc;
05615 
05616   if (!NumZeros)
05617     return false;
05618 
05619   // Considering the elements in the mask that are not consecutive zeros,
05620   // check if they consecutively come from only one of the source vectors.
05621   //
05622   //                           0    { A, B, X, X } = V2
05623   //                          / \    /  /
05624   //   vector_shuffle V1, V2 <X, X, 4, 5>
05625   //
05626   if (!isShuffleMaskConsecutive(SVOp,
05627             NumZeros,     // Mask Start Index
05628             NumElems,     // Mask End Index(exclusive)
05629             0,            // Where to start looking in the src vector
05630             NumElems,     // Number of elements in vector
05631             OpSrc))       // Which source operand ?
05632     return false;
05633 
05634   isLeft = true;
05635   ShAmt = NumZeros;
05636   ShVal = SVOp->getOperand(OpSrc);
05637   return true;
05638 }
05639 
05640 /// isVectorShift - Returns true if the shuffle can be implemented as a
05641 /// logical left or right shift of a vector.
05642 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05643                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05644   // Although the logic below support any bitwidth size, there are no
05645   // shift instructions which handle more than 128-bit vectors.
05646   if (!SVOp->getSimpleValueType(0).is128BitVector())
05647     return false;
05648 
05649   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
05650       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
05651     return true;
05652 
05653   return false;
05654 }
05655 
05656 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
05657 ///
05658 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
05659                                        unsigned NumNonZero, unsigned NumZero,
05660                                        SelectionDAG &DAG,
05661                                        const X86Subtarget* Subtarget,
05662                                        const TargetLowering &TLI) {
05663   if (NumNonZero > 8)
05664     return SDValue();
05665 
05666   SDLoc dl(Op);
05667   SDValue V;
05668   bool First = true;
05669   for (unsigned i = 0; i < 16; ++i) {
05670     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
05671     if (ThisIsNonZero && First) {
05672       if (NumZero)
05673         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05674       else
05675         V = DAG.getUNDEF(MVT::v8i16);
05676       First = false;
05677     }
05678 
05679     if ((i & 1) != 0) {
05680       SDValue ThisElt, LastElt;
05681       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
05682       if (LastIsNonZero) {
05683         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
05684                               MVT::i16, Op.getOperand(i-1));
05685       }
05686       if (ThisIsNonZero) {
05687         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
05688         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
05689                               ThisElt, DAG.getConstant(8, MVT::i8));
05690         if (LastIsNonZero)
05691           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
05692       } else
05693         ThisElt = LastElt;
05694 
05695       if (ThisElt.getNode())
05696         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
05697                         DAG.getIntPtrConstant(i/2));
05698     }
05699   }
05700 
05701   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
05702 }
05703 
05704 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
05705 ///
05706 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
05707                                      unsigned NumNonZero, unsigned NumZero,
05708                                      SelectionDAG &DAG,
05709                                      const X86Subtarget* Subtarget,
05710                                      const TargetLowering &TLI) {
05711   if (NumNonZero > 4)
05712     return SDValue();
05713 
05714   SDLoc dl(Op);
05715   SDValue V;
05716   bool First = true;
05717   for (unsigned i = 0; i < 8; ++i) {
05718     bool isNonZero = (NonZeros & (1 << i)) != 0;
05719     if (isNonZero) {
05720       if (First) {
05721         if (NumZero)
05722           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05723         else
05724           V = DAG.getUNDEF(MVT::v8i16);
05725         First = false;
05726       }
05727       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
05728                       MVT::v8i16, V, Op.getOperand(i),
05729                       DAG.getIntPtrConstant(i));
05730     }
05731   }
05732 
05733   return V;
05734 }
05735 
05736 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
05737 static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
05738                                      unsigned NonZeros, unsigned NumNonZero,
05739                                      unsigned NumZero, SelectionDAG &DAG,
05740                                      const X86Subtarget *Subtarget,
05741                                      const TargetLowering &TLI) {
05742   // We know there's at least one non-zero element
05743   unsigned FirstNonZeroIdx = 0;
05744   SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
05745   while (FirstNonZero.getOpcode() == ISD::UNDEF ||
05746          X86::isZeroNode(FirstNonZero)) {
05747     ++FirstNonZeroIdx;
05748     FirstNonZero = Op->getOperand(FirstNonZeroIdx);
05749   }
05750 
05751   if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05752       !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
05753     return SDValue();
05754 
05755   SDValue V = FirstNonZero.getOperand(0);
05756   MVT VVT = V.getSimpleValueType();
05757   if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
05758     return SDValue();
05759 
05760   unsigned FirstNonZeroDst =
05761       cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
05762   unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
05763   unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
05764   unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
05765 
05766   for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
05767     SDValue Elem = Op.getOperand(Idx);
05768     if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
05769       continue;
05770 
05771     // TODO: What else can be here? Deal with it.
05772     if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
05773       return SDValue();
05774 
05775     // TODO: Some optimizations are still possible here
05776     // ex: Getting one element from a vector, and the rest from another.
05777     if (Elem.getOperand(0) != V)
05778       return SDValue();
05779 
05780     unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
05781     if (Dst == Idx)
05782       ++CorrectIdx;
05783     else if (IncorrectIdx == -1U) {
05784       IncorrectIdx = Idx;
05785       IncorrectDst = Dst;
05786     } else
05787       // There was already one element with an incorrect index.
05788       // We can't optimize this case to an insertps.
05789       return SDValue();
05790   }
05791 
05792   if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
05793     SDLoc dl(Op);
05794     EVT VT = Op.getSimpleValueType();
05795     unsigned ElementMoveMask = 0;
05796     if (IncorrectIdx == -1U)
05797       ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
05798     else
05799       ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
05800 
05801     SDValue InsertpsMask =
05802         DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
05803     return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
05804   }
05805 
05806   return SDValue();
05807 }
05808 
05809 /// getVShift - Return a vector logical shift node.
05810 ///
05811 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
05812                          unsigned NumBits, SelectionDAG &DAG,
05813                          const TargetLowering &TLI, SDLoc dl) {
05814   assert(VT.is128BitVector() && "Unknown type for VShift");
05815   EVT ShVT = MVT::v2i64;
05816   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
05817   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
05818   return DAG.getNode(ISD::BITCAST, dl, VT,
05819                      DAG.getNode(Opc, dl, ShVT, SrcOp,
05820                              DAG.getConstant(NumBits,
05821                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
05822 }
05823 
05824 static SDValue
05825 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
05826 
05827   // Check if the scalar load can be widened into a vector load. And if
05828   // the address is "base + cst" see if the cst can be "absorbed" into
05829   // the shuffle mask.
05830   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
05831     SDValue Ptr = LD->getBasePtr();
05832     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
05833       return SDValue();
05834     EVT PVT = LD->getValueType(0);
05835     if (PVT != MVT::i32 && PVT != MVT::f32)
05836       return SDValue();
05837 
05838     int FI = -1;
05839     int64_t Offset = 0;
05840     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
05841       FI = FINode->getIndex();
05842       Offset = 0;
05843     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
05844                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
05845       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
05846       Offset = Ptr.getConstantOperandVal(1);
05847       Ptr = Ptr.getOperand(0);
05848     } else {
05849       return SDValue();
05850     }
05851 
05852     // FIXME: 256-bit vector instructions don't require a strict alignment,
05853     // improve this code to support it better.
05854     unsigned RequiredAlign = VT.getSizeInBits()/8;
05855     SDValue Chain = LD->getChain();
05856     // Make sure the stack object alignment is at least 16 or 32.
05857     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
05858     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
05859       if (MFI->isFixedObjectIndex(FI)) {
05860         // Can't change the alignment. FIXME: It's possible to compute
05861         // the exact stack offset and reference FI + adjust offset instead.
05862         // If someone *really* cares about this. That's the way to implement it.
05863         return SDValue();
05864       } else {
05865         MFI->setObjectAlignment(FI, RequiredAlign);
05866       }
05867     }
05868 
05869     // (Offset % 16 or 32) must be multiple of 4. Then address is then
05870     // Ptr + (Offset & ~15).
05871     if (Offset < 0)
05872       return SDValue();
05873     if ((Offset % RequiredAlign) & 3)
05874       return SDValue();
05875     int64_t StartOffset = Offset & ~(RequiredAlign-1);
05876     if (StartOffset)
05877       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
05878                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
05879 
05880     int EltNo = (Offset - StartOffset) >> 2;
05881     unsigned NumElems = VT.getVectorNumElements();
05882 
05883     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
05884     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
05885                              LD->getPointerInfo().getWithOffset(StartOffset),
05886                              false, false, false, 0);
05887 
05888     SmallVector<int, 8> Mask;
05889     for (unsigned i = 0; i != NumElems; ++i)
05890       Mask.push_back(EltNo);
05891 
05892     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
05893   }
05894 
05895   return SDValue();
05896 }
05897 
05898 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
05899 /// vector of type 'VT', see if the elements can be replaced by a single large
05900 /// load which has the same value as a build_vector whose operands are 'elts'.
05901 ///
05902 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
05903 ///
05904 /// FIXME: we'd also like to handle the case where the last elements are zero
05905 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
05906 /// There's even a handy isZeroNode for that purpose.
05907 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
05908                                         SDLoc &DL, SelectionDAG &DAG,
05909                                         bool isAfterLegalize) {
05910   EVT EltVT = VT.getVectorElementType();
05911   unsigned NumElems = Elts.size();
05912 
05913   LoadSDNode *LDBase = nullptr;
05914   unsigned LastLoadedElt = -1U;
05915 
05916   // For each element in the initializer, see if we've found a load or an undef.
05917   // If we don't find an initial load element, or later load elements are
05918   // non-consecutive, bail out.
05919   for (unsigned i = 0; i < NumElems; ++i) {
05920     SDValue Elt = Elts[i];
05921 
05922     if (!Elt.getNode() ||
05923         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
05924       return SDValue();
05925     if (!LDBase) {
05926       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
05927         return SDValue();
05928       LDBase = cast<LoadSDNode>(Elt.getNode());
05929       LastLoadedElt = i;
05930       continue;
05931     }
05932     if (Elt.getOpcode() == ISD::UNDEF)
05933       continue;
05934 
05935     LoadSDNode *LD = cast<LoadSDNode>(Elt);
05936     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
05937       return SDValue();
05938     LastLoadedElt = i;
05939   }
05940 
05941   // If we have found an entire vector of loads and undefs, then return a large
05942   // load of the entire vector width starting at the base pointer.  If we found
05943   // consecutive loads for the low half, generate a vzext_load node.
05944   if (LastLoadedElt == NumElems - 1) {
05945 
05946     if (isAfterLegalize &&
05947         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
05948       return SDValue();
05949 
05950     SDValue NewLd = SDValue();
05951 
05952     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
05953       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05954                           LDBase->getPointerInfo(),
05955                           LDBase->isVolatile(), LDBase->isNonTemporal(),
05956                           LDBase->isInvariant(), 0);
05957     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05958                         LDBase->getPointerInfo(),
05959                         LDBase->isVolatile(), LDBase->isNonTemporal(),
05960                         LDBase->isInvariant(), LDBase->getAlignment());
05961 
05962     if (LDBase->hasAnyUseOfValue(1)) {
05963       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05964                                      SDValue(LDBase, 1),
05965                                      SDValue(NewLd.getNode(), 1));
05966       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05967       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05968                              SDValue(NewLd.getNode(), 1));
05969     }
05970 
05971     return NewLd;
05972   }
05973   if (NumElems == 4 && LastLoadedElt == 1 &&
05974       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
05975     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
05976     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
05977     SDValue ResNode =
05978         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
05979                                 LDBase->getPointerInfo(),
05980                                 LDBase->getAlignment(),
05981                                 false/*isVolatile*/, true/*ReadMem*/,
05982                                 false/*WriteMem*/);
05983 
05984     // Make sure the newly-created LOAD is in the same position as LDBase in
05985     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
05986     // update uses of LDBase's output chain to use the TokenFactor.
05987     if (LDBase->hasAnyUseOfValue(1)) {
05988       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05989                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
05990       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05991       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05992                              SDValue(ResNode.getNode(), 1));
05993     }
05994 
05995     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
05996   }
05997   return SDValue();
05998 }
05999 
06000 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
06001 /// to generate a splat value for the following cases:
06002 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
06003 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
06004 /// a scalar load, or a constant.
06005 /// The VBROADCAST node is returned when a pattern is found,
06006 /// or SDValue() otherwise.
06007 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
06008                                     SelectionDAG &DAG) {
06009   // VBROADCAST requires AVX.
06010   // TODO: Splats could be generated for non-AVX CPUs using SSE
06011   // instructions, but there's less potential gain for only 128-bit vectors.
06012   if (!Subtarget->hasAVX())
06013     return SDValue();
06014 
06015   MVT VT = Op.getSimpleValueType();
06016   SDLoc dl(Op);
06017 
06018   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
06019          "Unsupported vector type for broadcast.");
06020 
06021   SDValue Ld;
06022   bool ConstSplatVal;
06023 
06024   switch (Op.getOpcode()) {
06025     default:
06026       // Unknown pattern found.
06027       return SDValue();
06028 
06029     case ISD::BUILD_VECTOR: {
06030       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
06031       BitVector UndefElements;
06032       SDValue Splat = BVOp->getSplatValue(&UndefElements);
06033 
06034       // We need a splat of a single value to use broadcast, and it doesn't
06035       // make any sense if the value is only in one element of the vector.
06036       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
06037         return SDValue();
06038 
06039       Ld = Splat;
06040       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
06041                        Ld.getOpcode() == ISD::ConstantFP);
06042 
06043       // Make sure that all of the users of a non-constant load are from the
06044       // BUILD_VECTOR node.
06045       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
06046         return SDValue();
06047       break;
06048     }
06049 
06050     case ISD::VECTOR_SHUFFLE: {
06051       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
06052 
06053       // Shuffles must have a splat mask where the first element is
06054       // broadcasted.
06055       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
06056         return SDValue();
06057 
06058       SDValue Sc = Op.getOperand(0);
06059       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
06060           Sc.getOpcode() != ISD::BUILD_VECTOR) {
06061 
06062         if (!Subtarget->hasInt256())
06063           return SDValue();
06064 
06065         // Use the register form of the broadcast instruction available on AVX2.
06066         if (VT.getSizeInBits() >= 256)
06067           Sc = Extract128BitVector(Sc, 0, DAG, dl);
06068         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
06069       }
06070 
06071       Ld = Sc.getOperand(0);
06072       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
06073                        Ld.getOpcode() == ISD::ConstantFP);
06074 
06075       // The scalar_to_vector node and the suspected
06076       // load node must have exactly one user.
06077       // Constants may have multiple users.
06078 
06079       // AVX-512 has register version of the broadcast
06080       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
06081         Ld.getValueType().getSizeInBits() >= 32;
06082       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
06083           !hasRegVer))
06084         return SDValue();
06085       break;
06086     }
06087   }
06088 
06089   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
06090   bool IsGE256 = (VT.getSizeInBits() >= 256);
06091 
06092   // When optimizing for size, generate up to 5 extra bytes for a broadcast
06093   // instruction to save 8 or more bytes of constant pool data.
06094   // TODO: If multiple splats are generated to load the same constant,
06095   // it may be detrimental to overall size. There needs to be a way to detect
06096   // that condition to know if this is truly a size win.
06097   const Function *F = DAG.getMachineFunction().getFunction();
06098   bool OptForSize = F->getAttributes().
06099     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
06100 
06101   // Handle broadcasting a single constant scalar from the constant pool
06102   // into a vector.
06103   // On Sandybridge (no AVX2), it is still better to load a constant vector
06104   // from the constant pool and not to broadcast it from a scalar.
06105   // But override that restriction when optimizing for size.
06106   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
06107   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
06108     EVT CVT = Ld.getValueType();
06109     assert(!CVT.isVector() && "Must not broadcast a vector type");
06110 
06111     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
06112     // For size optimization, also splat v2f64 and v2i64, and for size opt
06113     // with AVX2, also splat i8 and i16.
06114     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
06115     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
06116         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
06117       const Constant *C = nullptr;
06118       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
06119         C = CI->getConstantIntValue();
06120       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
06121         C = CF->getConstantFPValue();
06122 
06123       assert(C && "Invalid constant type");
06124 
06125       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06126       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
06127       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
06128       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
06129                        MachinePointerInfo::getConstantPool(),
06130                        false, false, false, Alignment);
06131 
06132       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06133     }
06134   }
06135 
06136   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
06137 
06138   // Handle AVX2 in-register broadcasts.
06139   if (!IsLoad && Subtarget->hasInt256() &&
06140       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
06141     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06142 
06143   // The scalar source must be a normal load.
06144   if (!IsLoad)
06145     return SDValue();
06146 
06147   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
06148     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06149 
06150   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
06151   // double since there is no vbroadcastsd xmm
06152   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
06153     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
06154       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06155   }
06156 
06157   // Unsupported broadcast.
06158   return SDValue();
06159 }
06160 
06161 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
06162 /// underlying vector and index.
06163 ///
06164 /// Modifies \p ExtractedFromVec to the real vector and returns the real
06165 /// index.
06166 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
06167                                          SDValue ExtIdx) {
06168   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
06169   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
06170     return Idx;
06171 
06172   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
06173   // lowered this:
06174   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
06175   // to:
06176   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
06177   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
06178   //                           undef)
06179   //                       Constant<0>)
06180   // In this case the vector is the extract_subvector expression and the index
06181   // is 2, as specified by the shuffle.
06182   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
06183   SDValue ShuffleVec = SVOp->getOperand(0);
06184   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
06185   assert(ShuffleVecVT.getVectorElementType() ==
06186          ExtractedFromVec.getSimpleValueType().getVectorElementType());
06187 
06188   int ShuffleIdx = SVOp->getMaskElt(Idx);
06189   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
06190     ExtractedFromVec = ShuffleVec;
06191     return ShuffleIdx;
06192   }
06193   return Idx;
06194 }
06195 
06196 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
06197   MVT VT = Op.getSimpleValueType();
06198 
06199   // Skip if insert_vec_elt is not supported.
06200   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06201   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
06202     return SDValue();
06203 
06204   SDLoc DL(Op);
06205   unsigned NumElems = Op.getNumOperands();
06206 
06207   SDValue VecIn1;
06208   SDValue VecIn2;
06209   SmallVector<unsigned, 4> InsertIndices;
06210   SmallVector<int, 8> Mask(NumElems, -1);
06211 
06212   for (unsigned i = 0; i != NumElems; ++i) {
06213     unsigned Opc = Op.getOperand(i).getOpcode();
06214 
06215     if (Opc == ISD::UNDEF)
06216       continue;
06217 
06218     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
06219       // Quit if more than 1 elements need inserting.
06220       if (InsertIndices.size() > 1)
06221         return SDValue();
06222 
06223       InsertIndices.push_back(i);
06224       continue;
06225     }
06226 
06227     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
06228     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
06229     // Quit if non-constant index.
06230     if (!isa<ConstantSDNode>(ExtIdx))
06231       return SDValue();
06232     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
06233 
06234     // Quit if extracted from vector of different type.
06235     if (ExtractedFromVec.getValueType() != VT)
06236       return SDValue();
06237 
06238     if (!VecIn1.getNode())
06239       VecIn1 = ExtractedFromVec;
06240     else if (VecIn1 != ExtractedFromVec) {
06241       if (!VecIn2.getNode())
06242         VecIn2 = ExtractedFromVec;
06243       else if (VecIn2 != ExtractedFromVec)
06244         // Quit if more than 2 vectors to shuffle
06245         return SDValue();
06246     }
06247 
06248     if (ExtractedFromVec == VecIn1)
06249       Mask[i] = Idx;
06250     else if (ExtractedFromVec == VecIn2)
06251       Mask[i] = Idx + NumElems;
06252   }
06253 
06254   if (!VecIn1.getNode())
06255     return SDValue();
06256 
06257   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
06258   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
06259   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
06260     unsigned Idx = InsertIndices[i];
06261     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
06262                      DAG.getIntPtrConstant(Idx));
06263   }
06264 
06265   return NV;
06266 }
06267 
06268 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
06269 SDValue
06270 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
06271 
06272   MVT VT = Op.getSimpleValueType();
06273   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
06274          "Unexpected type in LowerBUILD_VECTORvXi1!");
06275 
06276   SDLoc dl(Op);
06277   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06278     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
06279     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06280     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06281   }
06282 
06283   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
06284     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
06285     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06286     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06287   }
06288 
06289   bool AllContants = true;
06290   uint64_t Immediate = 0;
06291   int NonConstIdx = -1;
06292   bool IsSplat = true;
06293   unsigned NumNonConsts = 0;
06294   unsigned NumConsts = 0;
06295   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
06296     SDValue In = Op.getOperand(idx);
06297     if (In.getOpcode() == ISD::UNDEF)
06298       continue;
06299     if (!isa<ConstantSDNode>(In)) {
06300       AllContants = false;
06301       NonConstIdx = idx;
06302       NumNonConsts++;
06303     }
06304     else {
06305       NumConsts++;
06306       if (cast<ConstantSDNode>(In)->getZExtValue())
06307       Immediate |= (1ULL << idx);
06308     }
06309     if (In != Op.getOperand(0))
06310       IsSplat = false;
06311   }
06312 
06313   if (AllContants) {
06314     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
06315       DAG.getConstant(Immediate, MVT::i16));
06316     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
06317                        DAG.getIntPtrConstant(0));
06318   }
06319 
06320   if (NumNonConsts == 1 && NonConstIdx != 0) {
06321     SDValue DstVec;
06322     if (NumConsts) {
06323       SDValue VecAsImm = DAG.getConstant(Immediate,
06324                                          MVT::getIntegerVT(VT.getSizeInBits()));
06325       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
06326     }
06327     else 
06328       DstVec = DAG.getUNDEF(VT);
06329     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
06330                        Op.getOperand(NonConstIdx),
06331                        DAG.getIntPtrConstant(NonConstIdx));
06332   }
06333   if (!IsSplat && (NonConstIdx != 0))
06334     llvm_unreachable("Unsupported BUILD_VECTOR operation");
06335   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
06336   SDValue Select;
06337   if (IsSplat)
06338     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06339                           DAG.getConstant(-1, SelectVT),
06340                           DAG.getConstant(0, SelectVT));
06341   else
06342     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06343                          DAG.getConstant((Immediate | 1), SelectVT),
06344                          DAG.getConstant(Immediate, SelectVT));
06345   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
06346 }
06347 
06348 /// \brief Return true if \p N implements a horizontal binop and return the
06349 /// operands for the horizontal binop into V0 and V1.
06350 /// 
06351 /// This is a helper function of PerformBUILD_VECTORCombine.
06352 /// This function checks that the build_vector \p N in input implements a
06353 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
06354 /// operation to match.
06355 /// For example, if \p Opcode is equal to ISD::ADD, then this function
06356 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
06357 /// is equal to ISD::SUB, then this function checks if this is a horizontal
06358 /// arithmetic sub.
06359 ///
06360 /// This function only analyzes elements of \p N whose indices are
06361 /// in range [BaseIdx, LastIdx).
06362 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
06363                               SelectionDAG &DAG,
06364                               unsigned BaseIdx, unsigned LastIdx,
06365                               SDValue &V0, SDValue &V1) {
06366   EVT VT = N->getValueType(0);
06367 
06368   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
06369   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
06370          "Invalid Vector in input!");
06371   
06372   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
06373   bool CanFold = true;
06374   unsigned ExpectedVExtractIdx = BaseIdx;
06375   unsigned NumElts = LastIdx - BaseIdx;
06376   V0 = DAG.getUNDEF(VT);
06377   V1 = DAG.getUNDEF(VT);
06378 
06379   // Check if N implements a horizontal binop.
06380   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
06381     SDValue Op = N->getOperand(i + BaseIdx);
06382 
06383     // Skip UNDEFs.
06384     if (Op->getOpcode() == ISD::UNDEF) {
06385       // Update the expected vector extract index.
06386       if (i * 2 == NumElts)
06387         ExpectedVExtractIdx = BaseIdx;
06388       ExpectedVExtractIdx += 2;
06389       continue;
06390     }
06391 
06392     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
06393 
06394     if (!CanFold)
06395       break;
06396 
06397     SDValue Op0 = Op.getOperand(0);
06398     SDValue Op1 = Op.getOperand(1);
06399 
06400     // Try to match the following pattern:
06401     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
06402     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06403         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06404         Op0.getOperand(0) == Op1.getOperand(0) &&
06405         isa<ConstantSDNode>(Op0.getOperand(1)) &&
06406         isa<ConstantSDNode>(Op1.getOperand(1)));
06407     if (!CanFold)
06408       break;
06409 
06410     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06411     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
06412 
06413     if (i * 2 < NumElts) {
06414       if (V0.getOpcode() == ISD::UNDEF)
06415         V0 = Op0.getOperand(0);
06416     } else {
06417       if (V1.getOpcode() == ISD::UNDEF)
06418         V1 = Op0.getOperand(0);
06419       if (i * 2 == NumElts)
06420         ExpectedVExtractIdx = BaseIdx;
06421     }
06422 
06423     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
06424     if (I0 == ExpectedVExtractIdx)
06425       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
06426     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
06427       // Try to match the following dag sequence:
06428       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
06429       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
06430     } else
06431       CanFold = false;
06432 
06433     ExpectedVExtractIdx += 2;
06434   }
06435 
06436   return CanFold;
06437 }
06438 
06439 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
06440 /// a concat_vector. 
06441 ///
06442 /// This is a helper function of PerformBUILD_VECTORCombine.
06443 /// This function expects two 256-bit vectors called V0 and V1.
06444 /// At first, each vector is split into two separate 128-bit vectors.
06445 /// Then, the resulting 128-bit vectors are used to implement two
06446 /// horizontal binary operations. 
06447 ///
06448 /// The kind of horizontal binary operation is defined by \p X86Opcode.
06449 ///
06450 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
06451 /// the two new horizontal binop.
06452 /// When Mode is set, the first horizontal binop dag node would take as input
06453 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
06454 /// horizontal binop dag node would take as input the lower 128-bit of V1
06455 /// and the upper 128-bit of V1.
06456 ///   Example:
06457 ///     HADD V0_LO, V0_HI
06458 ///     HADD V1_LO, V1_HI
06459 ///
06460 /// Otherwise, the first horizontal binop dag node takes as input the lower
06461 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
06462 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
06463 ///   Example:
06464 ///     HADD V0_LO, V1_LO
06465 ///     HADD V0_HI, V1_HI
06466 ///
06467 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
06468 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
06469 /// the upper 128-bits of the result.
06470 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
06471                                      SDLoc DL, SelectionDAG &DAG,
06472                                      unsigned X86Opcode, bool Mode,
06473                                      bool isUndefLO, bool isUndefHI) {
06474   EVT VT = V0.getValueType();
06475   assert(VT.is256BitVector() && VT == V1.getValueType() &&
06476          "Invalid nodes in input!");
06477 
06478   unsigned NumElts = VT.getVectorNumElements();
06479   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
06480   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
06481   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
06482   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
06483   EVT NewVT = V0_LO.getValueType();
06484 
06485   SDValue LO = DAG.getUNDEF(NewVT);
06486   SDValue HI = DAG.getUNDEF(NewVT);
06487 
06488   if (Mode) {
06489     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06490     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
06491       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
06492     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
06493       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
06494   } else {
06495     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06496     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
06497                        V1_LO->getOpcode() != ISD::UNDEF))
06498       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
06499 
06500     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
06501                        V1_HI->getOpcode() != ISD::UNDEF))
06502       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
06503   }
06504 
06505   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
06506 }
06507 
06508 /// \brief Try to fold a build_vector that performs an 'addsub' into the
06509 /// sequence of 'vadd + vsub + blendi'.
06510 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
06511                            const X86Subtarget *Subtarget) {
06512   SDLoc DL(BV);
06513   EVT VT = BV->getValueType(0);
06514   unsigned NumElts = VT.getVectorNumElements();
06515   SDValue InVec0 = DAG.getUNDEF(VT);
06516   SDValue InVec1 = DAG.getUNDEF(VT);
06517 
06518   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
06519           VT == MVT::v2f64) && "build_vector with an invalid type found!");
06520 
06521   // Odd-numbered elements in the input build vector are obtained from
06522   // adding two integer/float elements.
06523   // Even-numbered elements in the input build vector are obtained from
06524   // subtracting two integer/float elements.
06525   unsigned ExpectedOpcode = ISD::FSUB;
06526   unsigned NextExpectedOpcode = ISD::FADD;
06527   bool AddFound = false;
06528   bool SubFound = false;
06529 
06530   for (unsigned i = 0, e = NumElts; i != e; i++) {
06531     SDValue Op = BV->getOperand(i);
06532 
06533     // Skip 'undef' values.
06534     unsigned Opcode = Op.getOpcode();
06535     if (Opcode == ISD::UNDEF) {
06536       std::swap(ExpectedOpcode, NextExpectedOpcode);
06537       continue;
06538     }
06539 
06540     // Early exit if we found an unexpected opcode.
06541     if (Opcode != ExpectedOpcode)
06542       return SDValue();
06543 
06544     SDValue Op0 = Op.getOperand(0);
06545     SDValue Op1 = Op.getOperand(1);
06546 
06547     // Try to match the following pattern:
06548     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
06549     // Early exit if we cannot match that sequence.
06550     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06551         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06552         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
06553         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
06554         Op0.getOperand(1) != Op1.getOperand(1))
06555       return SDValue();
06556 
06557     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06558     if (I0 != i)
06559       return SDValue();
06560 
06561     // We found a valid add/sub node. Update the information accordingly.
06562     if (i & 1)
06563       AddFound = true;
06564     else
06565       SubFound = true;
06566 
06567     // Update InVec0 and InVec1.
06568     if (InVec0.getOpcode() == ISD::UNDEF)
06569       InVec0 = Op0.getOperand(0);
06570     if (InVec1.getOpcode() == ISD::UNDEF)
06571       InVec1 = Op1.getOperand(0);
06572 
06573     // Make sure that operands in input to each add/sub node always
06574     // come from a same pair of vectors.
06575     if (InVec0 != Op0.getOperand(0)) {
06576       if (ExpectedOpcode == ISD::FSUB)
06577         return SDValue();
06578 
06579       // FADD is commutable. Try to commute the operands
06580       // and then test again.
06581       std::swap(Op0, Op1);
06582       if (InVec0 != Op0.getOperand(0))
06583         return SDValue();
06584     }
06585 
06586     if (InVec1 != Op1.getOperand(0))
06587       return SDValue();
06588 
06589     // Update the pair of expected opcodes.
06590     std::swap(ExpectedOpcode, NextExpectedOpcode);
06591   }
06592 
06593   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
06594   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
06595       InVec1.getOpcode() != ISD::UNDEF)
06596     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
06597 
06598   return SDValue();
06599 }
06600 
06601 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
06602                                           const X86Subtarget *Subtarget) {
06603   SDLoc DL(N);
06604   EVT VT = N->getValueType(0);
06605   unsigned NumElts = VT.getVectorNumElements();
06606   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
06607   SDValue InVec0, InVec1;
06608 
06609   // Try to match an ADDSUB.
06610   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
06611       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
06612     SDValue Value = matchAddSub(BV, DAG, Subtarget);
06613     if (Value.getNode())
06614       return Value;
06615   }
06616 
06617   // Try to match horizontal ADD/SUB.
06618   unsigned NumUndefsLO = 0;
06619   unsigned NumUndefsHI = 0;
06620   unsigned Half = NumElts/2;
06621 
06622   // Count the number of UNDEF operands in the build_vector in input.
06623   for (unsigned i = 0, e = Half; i != e; ++i)
06624     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06625       NumUndefsLO++;
06626 
06627   for (unsigned i = Half, e = NumElts; i != e; ++i)
06628     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06629       NumUndefsHI++;
06630 
06631   // Early exit if this is either a build_vector of all UNDEFs or all the
06632   // operands but one are UNDEF.
06633   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
06634     return SDValue();
06635 
06636   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
06637     // Try to match an SSE3 float HADD/HSUB.
06638     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06639       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06640     
06641     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06642       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06643   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
06644     // Try to match an SSSE3 integer HADD/HSUB.
06645     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06646       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
06647     
06648     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06649       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
06650   }
06651   
06652   if (!Subtarget->hasAVX())
06653     return SDValue();
06654 
06655   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
06656     // Try to match an AVX horizontal add/sub of packed single/double
06657     // precision floating point values from 256-bit vectors.
06658     SDValue InVec2, InVec3;
06659     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
06660         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
06661         ((InVec0.getOpcode() == ISD::UNDEF ||
06662           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06663         ((InVec1.getOpcode() == ISD::UNDEF ||
06664           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06665       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06666 
06667     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
06668         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
06669         ((InVec0.getOpcode() == ISD::UNDEF ||
06670           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06671         ((InVec1.getOpcode() == ISD::UNDEF ||
06672           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06673       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06674   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
06675     // Try to match an AVX2 horizontal add/sub of signed integers.
06676     SDValue InVec2, InVec3;
06677     unsigned X86Opcode;
06678     bool CanFold = true;
06679 
06680     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
06681         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
06682         ((InVec0.getOpcode() == ISD::UNDEF ||
06683           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06684         ((InVec1.getOpcode() == ISD::UNDEF ||
06685           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06686       X86Opcode = X86ISD::HADD;
06687     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
06688         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
06689         ((InVec0.getOpcode() == ISD::UNDEF ||
06690           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06691         ((InVec1.getOpcode() == ISD::UNDEF ||
06692           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06693       X86Opcode = X86ISD::HSUB;
06694     else
06695       CanFold = false;
06696 
06697     if (CanFold) {
06698       // Fold this build_vector into a single horizontal add/sub.
06699       // Do this only if the target has AVX2.
06700       if (Subtarget->hasAVX2())
06701         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
06702  
06703       // Do not try to expand this build_vector into a pair of horizontal
06704       // add/sub if we can emit a pair of scalar add/sub.
06705       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06706         return SDValue();
06707 
06708       // Convert this build_vector into a pair of horizontal binop followed by
06709       // a concat vector.
06710       bool isUndefLO = NumUndefsLO == Half;
06711       bool isUndefHI = NumUndefsHI == Half;
06712       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
06713                                    isUndefLO, isUndefHI);
06714     }
06715   }
06716 
06717   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
06718        VT == MVT::v16i16) && Subtarget->hasAVX()) {
06719     unsigned X86Opcode;
06720     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06721       X86Opcode = X86ISD::HADD;
06722     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06723       X86Opcode = X86ISD::HSUB;
06724     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06725       X86Opcode = X86ISD::FHADD;
06726     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06727       X86Opcode = X86ISD::FHSUB;
06728     else
06729       return SDValue();
06730 
06731     // Don't try to expand this build_vector into a pair of horizontal add/sub
06732     // if we can simply emit a pair of scalar add/sub.
06733     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06734       return SDValue();
06735 
06736     // Convert this build_vector into two horizontal add/sub followed by
06737     // a concat vector.
06738     bool isUndefLO = NumUndefsLO == Half;
06739     bool isUndefHI = NumUndefsHI == Half;
06740     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
06741                                  isUndefLO, isUndefHI);
06742   }
06743 
06744   return SDValue();
06745 }
06746 
06747 SDValue
06748 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
06749   SDLoc dl(Op);
06750 
06751   MVT VT = Op.getSimpleValueType();
06752   MVT ExtVT = VT.getVectorElementType();
06753   unsigned NumElems = Op.getNumOperands();
06754 
06755   // Generate vectors for predicate vectors.
06756   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
06757     return LowerBUILD_VECTORvXi1(Op, DAG);
06758 
06759   // Vectors containing all zeros can be matched by pxor and xorps later
06760   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06761     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
06762     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
06763     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
06764       return Op;
06765 
06766     return getZeroVector(VT, Subtarget, DAG, dl);
06767   }
06768 
06769   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
06770   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
06771   // vpcmpeqd on 256-bit vectors.
06772   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
06773     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
06774       return Op;
06775 
06776     if (!VT.is512BitVector())
06777       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
06778   }
06779 
06780   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
06781   if (Broadcast.getNode())
06782     return Broadcast;
06783 
06784   unsigned EVTBits = ExtVT.getSizeInBits();
06785 
06786   unsigned NumZero  = 0;
06787   unsigned NumNonZero = 0;
06788   unsigned NonZeros = 0;
06789   bool IsAllConstants = true;
06790   SmallSet<SDValue, 8> Values;
06791   for (unsigned i = 0; i < NumElems; ++i) {
06792     SDValue Elt = Op.getOperand(i);
06793     if (Elt.getOpcode() == ISD::UNDEF)
06794       continue;
06795     Values.insert(Elt);
06796     if (Elt.getOpcode() != ISD::Constant &&
06797         Elt.getOpcode() != ISD::ConstantFP)
06798       IsAllConstants = false;
06799     if (X86::isZeroNode(Elt))
06800       NumZero++;
06801     else {
06802       NonZeros |= (1 << i);
06803       NumNonZero++;
06804     }
06805   }
06806 
06807   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
06808   if (NumNonZero == 0)
06809     return DAG.getUNDEF(VT);
06810 
06811   // Special case for single non-zero, non-undef, element.
06812   if (NumNonZero == 1) {
06813     unsigned Idx = countTrailingZeros(NonZeros);
06814     SDValue Item = Op.getOperand(Idx);
06815 
06816     // If this is an insertion of an i64 value on x86-32, and if the top bits of
06817     // the value are obviously zero, truncate the value to i32 and do the
06818     // insertion that way.  Only do this if the value is non-constant or if the
06819     // value is a constant being inserted into element 0.  It is cheaper to do
06820     // a constant pool load than it is to do a movd + shuffle.
06821     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
06822         (!IsAllConstants || Idx == 0)) {
06823       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
06824         // Handle SSE only.
06825         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
06826         EVT VecVT = MVT::v4i32;
06827         unsigned VecElts = 4;
06828 
06829         // Truncate the value (which may itself be a constant) to i32, and
06830         // convert it to a vector with movd (S2V+shuffle to zero extend).
06831         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
06832         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
06833 
06834         // If using the new shuffle lowering, just directly insert this.
06835         if (ExperimentalVectorShuffleLowering)
06836           return DAG.getNode(
06837               ISD::BITCAST, dl, VT,
06838               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
06839 
06840         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06841 
06842         // Now we have our 32-bit value zero extended in the low element of
06843         // a vector.  If Idx != 0, swizzle it into place.
06844         if (Idx != 0) {
06845           SmallVector<int, 4> Mask;
06846           Mask.push_back(Idx);
06847           for (unsigned i = 1; i != VecElts; ++i)
06848             Mask.push_back(i);
06849           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
06850                                       &Mask[0]);
06851         }
06852         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06853       }
06854     }
06855 
06856     // If we have a constant or non-constant insertion into the low element of
06857     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
06858     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
06859     // depending on what the source datatype is.
06860     if (Idx == 0) {
06861       if (NumZero == 0)
06862         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06863 
06864       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
06865           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
06866         if (VT.is256BitVector() || VT.is512BitVector()) {
06867           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
06868           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
06869                              Item, DAG.getIntPtrConstant(0));
06870         }
06871         assert(VT.is128BitVector() && "Expected an SSE value type!");
06872         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06873         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
06874         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06875       }
06876 
06877       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
06878         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
06879         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
06880         if (VT.is256BitVector()) {
06881           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
06882           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
06883         } else {
06884           assert(VT.is128BitVector() && "Expected an SSE value type!");
06885           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06886         }
06887         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06888       }
06889     }
06890 
06891     // Is it a vector logical left shift?
06892     if (NumElems == 2 && Idx == 1 &&
06893         X86::isZeroNode(Op.getOperand(0)) &&
06894         !X86::isZeroNode(Op.getOperand(1))) {
06895       unsigned NumBits = VT.getSizeInBits();
06896       return getVShift(true, VT,
06897                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
06898                                    VT, Op.getOperand(1)),
06899                        NumBits/2, DAG, *this, dl);
06900     }
06901 
06902     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
06903       return SDValue();
06904 
06905     // Otherwise, if this is a vector with i32 or f32 elements, and the element
06906     // is a non-constant being inserted into an element other than the low one,
06907     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
06908     // movd/movss) to move this into the low element, then shuffle it into
06909     // place.
06910     if (EVTBits == 32) {
06911       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06912 
06913       // If using the new shuffle lowering, just directly insert this.
06914       if (ExperimentalVectorShuffleLowering)
06915         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
06916 
06917       // Turn it into a shuffle of zero and zero-extended scalar to vector.
06918       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
06919       SmallVector<int, 8> MaskVec;
06920       for (unsigned i = 0; i != NumElems; ++i)
06921         MaskVec.push_back(i == Idx ? 0 : 1);
06922       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
06923     }
06924   }
06925 
06926   // Splat is obviously ok. Let legalizer expand it to a shuffle.
06927   if (Values.size() == 1) {
06928     if (EVTBits == 32) {
06929       // Instead of a shuffle like this:
06930       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
06931       // Check if it's possible to issue this instead.
06932       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
06933       unsigned Idx = countTrailingZeros(NonZeros);
06934       SDValue Item = Op.getOperand(Idx);
06935       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
06936         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
06937     }
06938     return SDValue();
06939   }
06940 
06941   // A vector full of immediates; various special cases are already
06942   // handled, so this is best done with a single constant-pool load.
06943   if (IsAllConstants)
06944     return SDValue();
06945 
06946   // For AVX-length vectors, build the individual 128-bit pieces and use
06947   // shuffles to put them in place.
06948   if (VT.is256BitVector() || VT.is512BitVector()) {
06949     SmallVector<SDValue, 64> V;
06950     for (unsigned i = 0; i != NumElems; ++i)
06951       V.push_back(Op.getOperand(i));
06952 
06953     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
06954 
06955     // Build both the lower and upper subvector.
06956     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
06957                                 makeArrayRef(&V[0], NumElems/2));
06958     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
06959                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
06960 
06961     // Recreate the wider vector with the lower and upper part.
06962     if (VT.is256BitVector())
06963       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06964     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06965   }
06966 
06967   // Let legalizer expand 2-wide build_vectors.
06968   if (EVTBits == 64) {
06969     if (NumNonZero == 1) {
06970       // One half is zero or undef.
06971       unsigned Idx = countTrailingZeros(NonZeros);
06972       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
06973                                  Op.getOperand(Idx));
06974       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
06975     }
06976     return SDValue();
06977   }
06978 
06979   // If element VT is < 32 bits, convert it to inserts into a zero vector.
06980   if (EVTBits == 8 && NumElems == 16) {
06981     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
06982                                         Subtarget, *this);
06983     if (V.getNode()) return V;
06984   }
06985 
06986   if (EVTBits == 16 && NumElems == 8) {
06987     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
06988                                       Subtarget, *this);
06989     if (V.getNode()) return V;
06990   }
06991 
06992   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
06993   if (EVTBits == 32 && NumElems == 4) {
06994     SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
06995                                       NumZero, DAG, Subtarget, *this);
06996     if (V.getNode())
06997       return V;
06998   }
06999 
07000   // If element VT is == 32 bits, turn it into a number of shuffles.
07001   SmallVector<SDValue, 8> V(NumElems);
07002   if (NumElems == 4 && NumZero > 0) {
07003     for (unsigned i = 0; i < 4; ++i) {
07004       bool isZero = !(NonZeros & (1 << i));
07005       if (isZero)
07006         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
07007       else
07008         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
07009     }
07010 
07011     for (unsigned i = 0; i < 2; ++i) {
07012       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
07013         default: break;
07014         case 0:
07015           V[i] = V[i*2];  // Must be a zero vector.
07016           break;
07017         case 1:
07018           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
07019           break;
07020         case 2:
07021           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
07022           break;
07023         case 3:
07024           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
07025           break;
07026       }
07027     }
07028 
07029     bool Reverse1 = (NonZeros & 0x3) == 2;
07030     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
07031     int MaskVec[] = {
07032       Reverse1 ? 1 : 0,
07033       Reverse1 ? 0 : 1,
07034       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
07035       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
07036     };
07037     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
07038   }
07039 
07040   if (Values.size() > 1 && VT.is128BitVector()) {
07041     // Check for a build vector of consecutive loads.
07042     for (unsigned i = 0; i < NumElems; ++i)
07043       V[i] = Op.getOperand(i);
07044 
07045     // Check for elements which are consecutive loads.
07046     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
07047     if (LD.getNode())
07048       return LD;
07049 
07050     // Check for a build vector from mostly shuffle plus few inserting.
07051     SDValue Sh = buildFromShuffleMostly(Op, DAG);
07052     if (Sh.getNode())
07053       return Sh;
07054 
07055     // For SSE 4.1, use insertps to put the high elements into the low element.
07056     if (getSubtarget()->hasSSE41()) {
07057       SDValue Result;
07058       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
07059         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
07060       else
07061         Result = DAG.getUNDEF(VT);
07062 
07063       for (unsigned i = 1; i < NumElems; ++i) {
07064         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
07065         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
07066                              Op.getOperand(i), DAG.getIntPtrConstant(i));
07067       }
07068       return Result;
07069     }
07070 
07071     // Otherwise, expand into a number of unpckl*, start by extending each of
07072     // our (non-undef) elements to the full vector width with the element in the
07073     // bottom slot of the vector (which generates no code for SSE).
07074     for (unsigned i = 0; i < NumElems; ++i) {
07075       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
07076         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
07077       else
07078         V[i] = DAG.getUNDEF(VT);
07079     }
07080 
07081     // Next, we iteratively mix elements, e.g. for v4f32:
07082     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
07083     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
07084     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
07085     unsigned EltStride = NumElems >> 1;
07086     while (EltStride != 0) {
07087       for (unsigned i = 0; i < EltStride; ++i) {
07088         // If V[i+EltStride] is undef and this is the first round of mixing,
07089         // then it is safe to just drop this shuffle: V[i] is already in the
07090         // right place, the one element (since it's the first round) being
07091         // inserted as undef can be dropped.  This isn't safe for successive
07092         // rounds because they will permute elements within both vectors.
07093         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
07094             EltStride == NumElems/2)
07095           continue;
07096 
07097         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
07098       }
07099       EltStride >>= 1;
07100     }
07101     return V[0];
07102   }
07103   return SDValue();
07104 }
07105 
07106 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
07107 // to create 256-bit vectors from two other 128-bit ones.
07108 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
07109   SDLoc dl(Op);
07110   MVT ResVT = Op.getSimpleValueType();
07111 
07112   assert((ResVT.is256BitVector() ||
07113           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
07114 
07115   SDValue V1 = Op.getOperand(0);
07116   SDValue V2 = Op.getOperand(1);
07117   unsigned NumElems = ResVT.getVectorNumElements();
07118   if(ResVT.is256BitVector())
07119     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
07120 
07121   if (Op.getNumOperands() == 4) {
07122     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
07123                                 ResVT.getVectorNumElements()/2);
07124     SDValue V3 = Op.getOperand(2);
07125     SDValue V4 = Op.getOperand(3);
07126     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
07127       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
07128   }
07129   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
07130 }
07131 
07132 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
07133   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
07134   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
07135          (VT.is512BitVector() && (Op.getNumOperands