X86ISelLowering.cpp

00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86FrameLowering.h"
00019 #include "X86InstrBuilder.h"
00020 #include "X86MachineFunctionInfo.h"
00021 #include "X86TargetMachine.h"
00022 #include "X86TargetObjectFile.h"
00023 #include "llvm/ADT/SmallBitVector.h"
00024 #include "llvm/ADT/SmallSet.h"
00025 #include "llvm/ADT/Statistic.h"
00026 #include "llvm/ADT/StringExtras.h"
00027 #include "llvm/ADT/StringSwitch.h"
00028 #include "llvm/ADT/VariadicFunction.h"
00029 #include "llvm/CodeGen/IntrinsicLowering.h"
00030 #include "llvm/CodeGen/MachineFrameInfo.h"
00031 #include "llvm/CodeGen/MachineFunction.h"
00032 #include "llvm/CodeGen/MachineInstrBuilder.h"
00033 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00034 #include "llvm/CodeGen/MachineModuleInfo.h"
00035 #include "llvm/CodeGen/MachineRegisterInfo.h"
00036 #include "llvm/IR/CallSite.h"
00037 #include "llvm/IR/CallingConv.h"
00038 #include "llvm/IR/Constants.h"
00039 #include "llvm/IR/DerivedTypes.h"
00040 #include "llvm/IR/Function.h"
00041 #include "llvm/IR/GlobalAlias.h"
00042 #include "llvm/IR/GlobalVariable.h"
00043 #include "llvm/IR/Instructions.h"
00044 #include "llvm/IR/Intrinsics.h"
00045 #include "llvm/MC/MCAsmInfo.h"
00046 #include "llvm/MC/MCContext.h"
00047 #include "llvm/MC/MCExpr.h"
00048 #include "llvm/MC/MCSymbol.h"
00049 #include "llvm/Support/CommandLine.h"
00050 #include "llvm/Support/Debug.h"
00051 #include "llvm/Support/ErrorHandling.h"
00052 #include "llvm/Support/MathExtras.h"
00053 #include "llvm/Target/TargetOptions.h"
00054 #include "X86IntrinsicsInfo.h"
00055 #include <bitset>
00056 #include <numeric>
00057 #include <cctype>
00058 using namespace llvm;
00059 
00060 #define DEBUG_TYPE "x86-isel"
00061 
00062 STATISTIC(NumTailCalls, "Number of tail calls");
00063 
00064 static cl::opt<bool> ExperimentalVectorWideningLegalization(
00065     "x86-experimental-vector-widening-legalization", cl::init(false),
00066     cl::desc("Enable an experimental vector type legalization through widening "
00067              "rather than promotion."),
00068     cl::Hidden);
00069 
00070 static cl::opt<int> ReciprocalEstimateRefinementSteps(
00071     "x86-recip-refinement-steps", cl::init(1),
00072     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
00073              "result of the hardware reciprocal estimate instruction."),
00074     cl::NotHidden);
00075 
00076 // Forward declarations.
00077 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00078                        SDValue V2);
00079 
00080 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
00081                                 SelectionDAG &DAG, SDLoc dl,
00082                                 unsigned vectorWidth) {
00083   assert((vectorWidth == 128 || vectorWidth == 256) &&
00084          "Unsupported vector width");
00085   EVT VT = Vec.getValueType();
00086   EVT ElVT = VT.getVectorElementType();
00087   unsigned Factor = VT.getSizeInBits()/vectorWidth;
00088   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
00089                                   VT.getVectorNumElements()/Factor);
00090 
00091   // Extract from UNDEF is UNDEF.
00092   if (Vec.getOpcode() == ISD::UNDEF)
00093     return DAG.getUNDEF(ResultVT);
00094 
00095   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
00096   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
00097 
00098   // This is the index of the first element of the vectorWidth-bit chunk
00099   // we want.
00100   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
00101                                * ElemsPerChunk);
00102 
00103   // If the input is a buildvector just emit a smaller one.
00104   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
00105     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
00106                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
00107                                     ElemsPerChunk));
00108 
00109   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00110   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
00111 }
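// A worked example of the index math above, using illustrative values that do
// not appear in the original source: assuming Vec is v8i32 (256 bits),
// IdxVal == 5 and vectorWidth == 128, then
//   ElemsPerChunk    = 128 / 32 = 4
//   NormalizedIdxVal = ((5 * 32) / 128) * 4 = 4
// so the helper extracts the v4i32 chunk starting at element 4, i.e. the
// 128-bit half that contains element 5.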
00112 
00113 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
00114 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
00115 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
00116 /// instructions or a simple subregister reference. Idx is an index in the
00117 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
00118 /// lowering EXTRACT_VECTOR_ELT operations easier.
00119 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
00120                                    SelectionDAG &DAG, SDLoc dl) {
00121   assert((Vec.getValueType().is256BitVector() ||
00122           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
00123   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
00124 }
00125 
00126 /// Generate a DAG to grab 256-bits from a 512-bit vector.
00127 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
00128                                    SelectionDAG &DAG, SDLoc dl) {
00129   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
00130   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
00131 }
00132 
00133 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
00134                                unsigned IdxVal, SelectionDAG &DAG,
00135                                SDLoc dl, unsigned vectorWidth) {
00136   assert((vectorWidth == 128 || vectorWidth == 256) &&
00137          "Unsupported vector width");
00138   // Inserting UNDEF into Result returns Result unchanged.
00139   if (Vec.getOpcode() == ISD::UNDEF)
00140     return Result;
00141   EVT VT = Vec.getValueType();
00142   EVT ElVT = VT.getVectorElementType();
00143   EVT ResultVT = Result.getValueType();
00144 
00145   // Insert the relevant vectorWidth bits.
00146   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
00147 
00148   // This is the index of the first element of the vectorWidth-bit chunk
00149   // we want.
00150   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
00151                                * ElemsPerChunk);
00152 
00153   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00154   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
00155 }
00156 
00157 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
00158 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
00159 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
00160 /// simple superregister reference.  Idx is an index in the 128 bits
00161 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
00162 /// lowering INSERT_VECTOR_ELT operations easier.
00163 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
00164                                   SelectionDAG &DAG, SDLoc dl) {
00165   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
00166   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
00167 }
00168 
00169 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
00170                                   SelectionDAG &DAG, SDLoc dl) {
00171   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
00172   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
00173 }
00174 
00175 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
00176 /// instructions. This is used because creating CONCAT_VECTOR nodes of
00177 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
00178 /// large BUILD_VECTORS.
00179 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
00180                                    unsigned NumElems, SelectionDAG &DAG,
00181                                    SDLoc dl) {
00182   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00183   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
00184 }
00185 
00186 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
00187                                    unsigned NumElems, SelectionDAG &DAG,
00188                                    SDLoc dl) {
00189   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00190   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
00191 }
00192 
00193 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
00194                                      const X86Subtarget &STI)
00195     : TargetLowering(TM), Subtarget(&STI) {
00196   X86ScalarSSEf64 = Subtarget->hasSSE2();
00197   X86ScalarSSEf32 = Subtarget->hasSSE1();
00198   TD = getDataLayout();
00199 
00200   // Set up the TargetLowering object.
00201   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00202 
00203   // X86 is weird. It always uses i8 for shift amounts and setcc results.
00204   setBooleanContents(ZeroOrOneBooleanContent);
00205   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00206   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
00207 
00208   // For 64-bit, since we have so many registers, use the ILP scheduler.
00209   // For 32-bit, use the register pressure specific scheduling.
00210   // For Atom, always use ILP scheduling.
00211   if (Subtarget->isAtom())
00212     setSchedulingPreference(Sched::ILP);
00213   else if (Subtarget->is64Bit())
00214     setSchedulingPreference(Sched::ILP);
00215   else
00216     setSchedulingPreference(Sched::RegPressure);
00217   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
00218   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00219 
00220   // Bypass expensive divides on Atom when compiling with O2.
00221   if (TM.getOptLevel() >= CodeGenOpt::Default) {
00222     if (Subtarget->hasSlowDivide32())
00223       addBypassSlowDiv(32, 8);
00224     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
00225       addBypassSlowDiv(64, 16);
00226   }
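  // Roughly, addBypassSlowDiv(32, 8) asks the slow-division bypass to guard a
  // 32-bit div/rem with a runtime check and to use the much cheaper 8-bit
  // divide when both operands happen to fit in 8 bits; addBypassSlowDiv(64, 16)
  // does the same with a 16-bit divide.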
00227 
00228   if (Subtarget->isTargetKnownWindowsMSVC()) {
00229     // Setup Windows compiler runtime calls.
00230     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00231     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00232     setLibcallName(RTLIB::SREM_I64, "_allrem");
00233     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00234     setLibcallName(RTLIB::MUL_I64, "_allmul");
00235     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00236     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00237     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00238     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00239     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00240 
00241     // The _ftol2 runtime function has an unusual calling conv, which
00242     // is modeled by a special pseudo-instruction.
00243     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
00244     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
00245     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
00246     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
00247   }
00248 
00249   if (Subtarget->isTargetDarwin()) {
00250     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00251     setUseUnderscoreSetJmp(false);
00252     setUseUnderscoreLongJmp(false);
00253   } else if (Subtarget->isTargetWindowsGNU()) {
00254     // MS runtime is weird: it exports _setjmp, but longjmp!
00255     setUseUnderscoreSetJmp(true);
00256     setUseUnderscoreLongJmp(false);
00257   } else {
00258     setUseUnderscoreSetJmp(true);
00259     setUseUnderscoreLongJmp(true);
00260   }
00261 
00262   // Set up the register classes.
00263   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00264   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00265   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00266   if (Subtarget->is64Bit())
00267     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00268 
00269   for (MVT VT : MVT::integer_valuetypes())
00270     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
00271 
00272   // We don't accept any truncstore of integer registers.
00273   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00274   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00275   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00276   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00277   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00278   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00279 
00280   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00281 
00282   // SETOEQ and SETUNE require checking two conditions.
00283   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00284   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00285   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00286   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00287   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00288   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00289 
00290   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00291   // operation.
00292   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00293   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00294   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00295 
00296   if (Subtarget->is64Bit()) {
00297     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00298     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00299   } else if (!TM.Options.UseSoftFloat) {
00300     // We have an algorithm for SSE2->double, and we turn this into a
00301     // 64-bit FILD followed by conditional FADD for other targets.
00302     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00303     // We have an algorithm for SSE2, and we turn this into a 64-bit
00304     // FILD for other targets.
00305     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00306   }
00307 
00308   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00309   // this operation.
00310   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00311   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00312 
00313   if (!TM.Options.UseSoftFloat) {
00314     // SSE has no i16 to fp conversion, only i32
00315     if (X86ScalarSSEf32) {
00316       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00317       // f32 and f64 cases are Legal, f80 case is not
00318       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00319     } else {
00320       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00321       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00322     }
00323   } else {
00324     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00325     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00326   }
00327 
00328   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00329   // are Legal, f80 is custom lowered.
00330   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00331   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00332 
00333   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
00334   // this operation.
00335   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00336   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00337 
00338   if (X86ScalarSSEf32) {
00339     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00340     // f32 and f64 cases are Legal, f80 case is not
00341     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00342   } else {
00343     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00344     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00345   }
00346 
00347   // Handle FP_TO_UINT by promoting the destination to a larger signed
00348   // conversion.
00349   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00350   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00351   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
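  // For instance, under this promotion an f32 -> i16 FP_TO_UINT is roughly
  // carried out as an f32 -> i32 signed conversion whose low 16 bits give the
  // unsigned result (the exact promoted node is chosen by the legalizer).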
00352 
00353   if (Subtarget->is64Bit()) {
00354     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00355     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00356   } else if (!TM.Options.UseSoftFloat) {
00357     // Since AVX is a superset of SSE3, only check for SSE here.
00358     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00359       // Expand FP_TO_UINT into a select.
00360       // FIXME: We would like to use a Custom expander here eventually to do
00361       // the optimal thing for SSE vs. the default expansion in the legalizer.
00362       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00363     else
00364       // With SSE3 we can use fisttpll to convert to a signed i64; without
00365       // SSE, we're stuck with a fistpll.
00366       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00367   }
00368 
00369   if (isTargetFTOL()) {
00370     // Use the _ftol2 runtime function, which has a pseudo-instruction
00371     // to handle its weird calling convention.
00372     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00373   }
00374 
00375   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00376   if (!X86ScalarSSEf64) {
00377     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00378     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00379     if (Subtarget->is64Bit()) {
00380       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00381       // Without SSE, i64->f64 goes through memory.
00382       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00383     }
00384   }
00385 
00386   // Scalar integer divide and remainder are lowered to use operations that
00387   // produce two results, to match the available instructions. This exposes
00388   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00389   // into a single instruction.
00390   //
00391   // Scalar integer multiply-high is also lowered to use two-result
00392   // operations, to match the available instructions. However, plain multiply
00393   // (low) operations are left as Legal, as there are single-result
00394   // instructions for this in x86. Using the two-result multiply instructions
00395   // when both high and low results are needed must be arranged by dagcombine.
00396   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00397     MVT VT = IntVTs[i];
00398     setOperationAction(ISD::MULHS, VT, Expand);
00399     setOperationAction(ISD::MULHU, VT, Expand);
00400     setOperationAction(ISD::SDIV, VT, Expand);
00401     setOperationAction(ISD::UDIV, VT, Expand);
00402     setOperationAction(ISD::SREM, VT, Expand);
00403     setOperationAction(ISD::UREM, VT, Expand);
00404 
00405     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
00406     setOperationAction(ISD::ADDC, VT, Custom);
00407     setOperationAction(ISD::ADDE, VT, Custom);
00408     setOperationAction(ISD::SUBC, VT, Custom);
00409     setOperationAction(ISD::SUBE, VT, Custom);
00410   }
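  // As a concrete illustration of the CSE point above, IR such as
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // legalizes into two ISD::SDIVREM nodes with identical operands; CSE merges
  // them, so a single hardware divide produces both the quotient and the
  // remainder.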
00411 
00412   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00413   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00414   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00415   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00416   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00417   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00418   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00419   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00420   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00421   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
00422   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
00423   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
00424   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
00425   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
00426   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
00427   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
00428   if (Subtarget->is64Bit())
00429     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00430   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00431   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00432   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00433   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00434   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00435   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00436   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00437   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00438 
00439   // Promote the i8 variants and force them on up to i32 which has a shorter
00440   // encoding.
00441   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00442   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00443   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00444   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
00445   if (Subtarget->hasBMI()) {
00446     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00447     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00448     if (Subtarget->is64Bit())
00449       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00450   } else {
00451     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00452     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00453     if (Subtarget->is64Bit())
00454       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00455   }
00456 
00457   if (Subtarget->hasLZCNT()) {
00458     // When promoting the i8 variants, force them to i32 for a shorter
00459     // encoding.
00460     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00461     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00462     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00463     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00464     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00465     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00466     if (Subtarget->is64Bit())
00467       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00468   } else {
00469     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00470     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00471     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00472     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00473     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00474     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00475     if (Subtarget->is64Bit()) {
00476       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00477       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00478     }
00479   }
00480 
00481   // Special handling for half-precision floating point conversions.
00482   // If we don't have F16C support, then lower half float conversions
00483   // into library calls.
00484   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
00485     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
00486     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
00487   }
00488 
00489   // There's never any support for operations beyond MVT::f32.
00490   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
00491   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
00492   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
00493   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
00494 
00495   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
00496   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
00497   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
00498   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00499   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00500   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
00501 
00502   if (Subtarget->hasPOPCNT()) {
00503     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00504   } else {
00505     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00506     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00507     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00508     if (Subtarget->is64Bit())
00509       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00510   }
00511 
00512   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00513 
00514   if (!Subtarget->hasMOVBE())
00515     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00516 
00517   // These should be promoted to a larger select which is supported.
00518   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00519   // X86 wants to expand cmov itself.
00520   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00521   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00522   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00523   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00524   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00525   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00526   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00527   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00528   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00529   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00530   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00531   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00532   if (Subtarget->is64Bit()) {
00533     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00534     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00535   }
00536   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00537   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
00538   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
00539   // support continuations, user-level threading, etc. As a result, no
00540   // other SjLj exception interfaces are implemented; please don't build
00541   // your own exception handling on top of them.
00542   // LLVM/Clang supports zero-cost DWARF exception handling.
00543   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00544   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00545 
00546   // Darwin ABI issue.
00547   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00548   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00549   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00550   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00551   if (Subtarget->is64Bit())
00552     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00553   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00554   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00555   if (Subtarget->is64Bit()) {
00556     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00557     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00558     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00559     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00560     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00561   }
00562   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00563   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00564   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00565   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00566   if (Subtarget->is64Bit()) {
00567     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00568     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00569     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00570   }
00571 
00572   if (Subtarget->hasSSE1())
00573     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00574 
00575   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00576 
00577   // Expand certain atomics
00578   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00579     MVT VT = IntVTs[i];
00580     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
00581     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00582     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00583   }
00584 
00585   if (Subtarget->hasCmpxchg16b()) {
00586     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
00587   }
00588 
00589   // FIXME - use subtarget debug flags
00590   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
00591       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
00592     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00593   }
00594 
00595   if (Subtarget->is64Bit()) {
00596     setExceptionPointerRegister(X86::RAX);
00597     setExceptionSelectorRegister(X86::RDX);
00598   } else {
00599     setExceptionPointerRegister(X86::EAX);
00600     setExceptionSelectorRegister(X86::EDX);
00601   }
00602   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00603   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00604 
00605   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00606   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00607 
00608   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00609   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00610 
00611   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00612   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00613   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00614   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00615     // TargetInfo::X86_64ABIBuiltinVaList
00616     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00617     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00618   } else {
00619     // TargetInfo::CharPtrBuiltinVaList
00620     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00621     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00622   }
00623 
00624   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00625   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00626 
00627   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
00628 
00629   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00630     // f32 and f64 use SSE.
00631     // Set up the FP register classes.
00632     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00633     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00634 
00635     // Use ANDPD to simulate FABS.
00636     setOperationAction(ISD::FABS , MVT::f64, Custom);
00637     setOperationAction(ISD::FABS , MVT::f32, Custom);
00638 
00639     // Use XORP to simulate FNEG.
00640     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00641     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00642 
00643     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00644     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00645     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00646 
00647     // Lower this to FGETSIGNx86 plus an AND.
00648     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00649     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00650 
00651     // We don't support sin/cos/fmod
00652     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00653     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00654     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00655     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00656     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00657     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00658 
00659     // Expand FP immediates into loads from the stack, except for the special
00660     // cases we handle.
00661     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00662     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00663   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00664     // Use SSE for f32, x87 for f64.
00665     // Set up the FP register classes.
00666     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00667     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00668 
00669     // Use ANDPS to simulate FABS.
00670     setOperationAction(ISD::FABS , MVT::f32, Custom);
00671 
00672     // Use XORP to simulate FNEG.
00673     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00674 
00675     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00676 
00677     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00678     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00679     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00680 
00681     // We don't support sin/cos/fmod
00682     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00683     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00684     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00685 
00686     // Special cases we handle for FP constants.
00687     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00688     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00689     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00690     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00691     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00692 
00693     if (!TM.Options.UnsafeFPMath) {
00694       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00695       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00696       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00697     }
00698   } else if (!TM.Options.UseSoftFloat) {
00699     // f32 and f64 in x87.
00700     // Set up the FP register classes.
00701     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00702     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00703 
00704     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00705     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00706     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00707     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00708 
00709     if (!TM.Options.UnsafeFPMath) {
00710       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00711       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00712       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00713       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00714       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00715       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00716     }
00717     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00718     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00719     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00720     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00721     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00722     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00723     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00724     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00725   }
00726 
00727   // We don't support FMA.
00728   setOperationAction(ISD::FMA, MVT::f64, Expand);
00729   setOperationAction(ISD::FMA, MVT::f32, Expand);
00730 
00731   // Long double always uses X87.
00732   if (!TM.Options.UseSoftFloat) {
00733     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00734     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00735     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00736     {
00737       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00738       addLegalFPImmediate(TmpFlt);  // FLD0
00739       TmpFlt.changeSign();
00740       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00741 
00742       bool ignored;
00743       APFloat TmpFlt2(+1.0);
00744       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00745                       &ignored);
00746       addLegalFPImmediate(TmpFlt2);  // FLD1
00747       TmpFlt2.changeSign();
00748       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00749     }
00750 
00751     if (!TM.Options.UnsafeFPMath) {
00752       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00753       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00754       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00755     }
00756 
00757     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00758     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00759     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00760     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00761     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00762     setOperationAction(ISD::FMA, MVT::f80, Expand);
00763   }
00764 
00765   // Always use a library call for pow.
00766   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00767   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00768   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00769 
00770   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00771   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00772   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00773   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00774   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00775   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
00776   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
00777 
00778   // First set operation action for all vector types to either promote
00779   // (for widening) or expand (for scalarization). Then we will selectively
00780   // turn on ones that can be effectively codegen'd.
00781   for (MVT VT : MVT::vector_valuetypes()) {
00782     setOperationAction(ISD::ADD , VT, Expand);
00783     setOperationAction(ISD::SUB , VT, Expand);
00784     setOperationAction(ISD::FADD, VT, Expand);
00785     setOperationAction(ISD::FNEG, VT, Expand);
00786     setOperationAction(ISD::FSUB, VT, Expand);
00787     setOperationAction(ISD::MUL , VT, Expand);
00788     setOperationAction(ISD::FMUL, VT, Expand);
00789     setOperationAction(ISD::SDIV, VT, Expand);
00790     setOperationAction(ISD::UDIV, VT, Expand);
00791     setOperationAction(ISD::FDIV, VT, Expand);
00792     setOperationAction(ISD::SREM, VT, Expand);
00793     setOperationAction(ISD::UREM, VT, Expand);
00794     setOperationAction(ISD::LOAD, VT, Expand);
00795     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00796     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
00797     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00798     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
00799     setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
00800     setOperationAction(ISD::FABS, VT, Expand);
00801     setOperationAction(ISD::FSIN, VT, Expand);
00802     setOperationAction(ISD::FSINCOS, VT, Expand);
00803     setOperationAction(ISD::FCOS, VT, Expand);
00804     setOperationAction(ISD::FSINCOS, VT, Expand);
00805     setOperationAction(ISD::FREM, VT, Expand);
00806     setOperationAction(ISD::FMA,  VT, Expand);
00807     setOperationAction(ISD::FPOWI, VT, Expand);
00808     setOperationAction(ISD::FSQRT, VT, Expand);
00809     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00810     setOperationAction(ISD::FFLOOR, VT, Expand);
00811     setOperationAction(ISD::FCEIL, VT, Expand);
00812     setOperationAction(ISD::FTRUNC, VT, Expand);
00813     setOperationAction(ISD::FRINT, VT, Expand);
00814     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00815     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00816     setOperationAction(ISD::MULHS, VT, Expand);
00817     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00818     setOperationAction(ISD::MULHU, VT, Expand);
00819     setOperationAction(ISD::SDIVREM, VT, Expand);
00820     setOperationAction(ISD::UDIVREM, VT, Expand);
00821     setOperationAction(ISD::FPOW, VT, Expand);
00822     setOperationAction(ISD::CTPOP, VT, Expand);
00823     setOperationAction(ISD::CTTZ, VT, Expand);
00824     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00825     setOperationAction(ISD::CTLZ, VT, Expand);
00826     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00827     setOperationAction(ISD::SHL, VT, Expand);
00828     setOperationAction(ISD::SRA, VT, Expand);
00829     setOperationAction(ISD::SRL, VT, Expand);
00830     setOperationAction(ISD::ROTL, VT, Expand);
00831     setOperationAction(ISD::ROTR, VT, Expand);
00832     setOperationAction(ISD::BSWAP, VT, Expand);
00833     setOperationAction(ISD::SETCC, VT, Expand);
00834     setOperationAction(ISD::FLOG, VT, Expand);
00835     setOperationAction(ISD::FLOG2, VT, Expand);
00836     setOperationAction(ISD::FLOG10, VT, Expand);
00837     setOperationAction(ISD::FEXP, VT, Expand);
00838     setOperationAction(ISD::FEXP2, VT, Expand);
00839     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00840     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00841     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00842     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00843     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
00844     setOperationAction(ISD::TRUNCATE, VT, Expand);
00845     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00846     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00847     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00848     setOperationAction(ISD::VSELECT, VT, Expand);
00849     setOperationAction(ISD::SELECT_CC, VT, Expand);
00850     for (MVT InnerVT : MVT::vector_valuetypes()) {
00851       setTruncStoreAction(InnerVT, VT, Expand);
00852 
00853       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
00854       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
00855 
00856       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
00857       // types, we have to deal with them whether we ask for Expansion or not.
00858       // Setting Expand causes its own optimisation problems though, so leave
00859       // them legal.
00860       if (VT.getVectorElementType() == MVT::i1)
00861         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
00862     }
00863   }
00864 
00865   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00866   // with -msoft-float, disable use of MMX as well.
00867   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00868     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00869     // No operations on x86mmx supported, everything uses intrinsics.
00870   }
00871 
00872   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00873   // into smaller operations.
00874   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
00875   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
00876   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
00877   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
00878   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
00879   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
00880   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
00881   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
00882   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
00883   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
00884   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
00885   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
00886   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
00887   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
00888   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
00889   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
00890   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
00891   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
00892   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
00893   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
00894   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00895   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
00896   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
00897   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
00898   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
00899   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
00900   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
00901   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
00902   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
00903 
00904   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00905     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00906 
00907     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00908     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00909     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00910     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00911     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00912     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00913     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00914     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00915     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00916     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00917     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
00918     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00919     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00920     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
00921   }
00922 
00923   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00924     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00925 
00926     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
00927     // registers cannot be used even for integer operations.
00928     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00929     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00930     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00931     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00932 
00933     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00934     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00935     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00936     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00937     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00938     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00939     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
00940     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
00941     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
00942     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
00943     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00944     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00945     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00946     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00947     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00948     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00949     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00950     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00951     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00952     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00953     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00954     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00955 
00956     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00957     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00958     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00959     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00960 
00961     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00962     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00963     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00964     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00965     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00966 
00967     // Only provide customized ctpop vector bit twiddling for vector types we
00968     // know to perform better than using the popcnt instructions on each vector
00969     // element. If popcnt isn't supported, always provide the custom version.
00970     if (!Subtarget->hasPOPCNT()) {
00971       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
00972       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
00973     }
00974 
00975     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
00976     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00977       MVT VT = (MVT::SimpleValueType)i;
00978       // Do not attempt to custom lower non-power-of-2 vectors
00979       if (!isPowerOf2_32(VT.getVectorNumElements()))
00980         continue;
00981       // Do not attempt to custom lower non-128-bit vectors
00982       if (!VT.is128BitVector())
00983         continue;
00984       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
00985       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
00986       setOperationAction(ISD::VSELECT,            VT, Custom);
00987       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
00988     }
00989 
00990     // We support custom legalizing of sext and anyext loads for specific
00991     // memory vector types which we can load as a scalar (or sequence of
00992     // scalars) and extend in-register to a legal 128-bit vector type. For sext
00993     // loads these must work with a single scalar load.
00994     for (MVT VT : MVT::integer_vector_valuetypes()) {
00995       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
00996       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
00997       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
00998       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
00999       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
01000       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
01001       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
01002       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
01003       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
01004     }
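    // For instance, under this scheme a sign-extending load of v4i8 can be
    // implemented as one 32-bit scalar load followed by an in-register sign
    // extension to a legal 128-bit type, instead of four separate byte loads.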
01005 
01006     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
01007     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
01008     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
01009     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
01010     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
01011     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
01012     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
01013     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
01014 
01015     if (Subtarget->is64Bit()) {
01016       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01017       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01018     }
01019 
01020     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
01021     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01022       MVT VT = (MVT::SimpleValueType)i;
01023 
01024       // Do not attempt to promote non-128-bit vectors
01025       if (!VT.is128BitVector())
01026         continue;
01027 
01028       setOperationAction(ISD::AND,    VT, Promote);
01029       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
01030       setOperationAction(ISD::OR,     VT, Promote);
01031       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
01032       setOperationAction(ISD::XOR,    VT, Promote);
01033       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
01034       setOperationAction(ISD::LOAD,   VT, Promote);
01035       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
01036       setOperationAction(ISD::SELECT, VT, Promote);
01037       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
01038     }
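    // In practice this promotion means, for example, that an AND of two
    // v16i8 values is bitcast to v2i64, performed there, and bitcast back,
    // so only the v2i64 patterns are needed for the 128-bit integer types.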
01039 
01040     // Custom lower v2i64 and v2f64 selects.
01041     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
01042     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
01043     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
01044     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
01045 
01046     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
01047     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
01048 
01049     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
01050     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
01051     // As there is no 64-bit GPR available, we need to build a special custom
01052     // sequence to convert from v2i32 to v2f32.
01053     if (!Subtarget->is64Bit())
01054       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
01055 
01056     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
01057     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
01058 
01059     for (MVT VT : MVT::fp_vector_valuetypes())
01060       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
01061 
01062     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
01063     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
01064     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
01065   }
01066 
01067   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
01068     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
01069     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
01070     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
01071     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
01072     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
01073     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
01074     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
01075     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
01076     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
01077     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
01078 
01079     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
01080     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
01081     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
01082     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
01083     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
01084     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
01085     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
01086     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
01087     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
01088     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
01089 
01090     // FIXME: Do we need to handle scalar-to-vector here?
01091     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
01092 
01093     // We directly match byte blends in the backend as they match the VSELECT
01094     // condition form.
01095     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
01096 
01097     // SSE41 brings specific instructions for doing vector sign extend even in
01098     // cases where we don't have SRA.
01099     for (MVT VT : MVT::integer_vector_valuetypes()) {
01100       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
01101       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
01102       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
01103     }
01104 
01105     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
01106     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
01107     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
01108     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
01109     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
01110     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
01111     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
01112 
01113     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
01114     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
01115     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
01116     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
01117     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
01118     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
01119 
01120     // i8 and i16 vectors are custom because the source register and source
01121     // memory operand types are not the same width.  f32 vectors are
01122     // custom since the immediate controlling the insert encodes additional
01123     // information.
01124     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
01125     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01126     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01127     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01128 
01129     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
01130     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
01131     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
01132     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
01133 
01134     // FIXME: these should be Legal, but that's only for the case where
01135     // the index is constant.  For now custom expand to deal with that.
01136     if (Subtarget->is64Bit()) {
01137       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01138       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01139     }
01140   }
01141 
01142   if (Subtarget->hasSSE2()) {
01143     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
01144     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
01145 
01146     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01147     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01148 
01149     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01150     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01151 
01152     // In the customized shift lowering, the legal cases in AVX2 will be
01153     // recognized.
01154     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01155     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01156 
01157     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01158     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01159 
01160     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01161   }
01162 
01163   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01164     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01165     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01166     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01167     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01168     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01169     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01170 
01171     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01172     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01173     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01174 
01175     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01176     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01177     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01178     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01179     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01180     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01181     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01182     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01183     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01184     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01185     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01186     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01187 
01188     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01189     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01190     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01191     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01192     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01193     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01194     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01195     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01196     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01197     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01198     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01199     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01200 
01201     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01202     // even though v8i16 is a legal type.
01203     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01204     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01205     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01206 
01207     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01208     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01209     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01210 
01211     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01212     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01213 
01214     for (MVT VT : MVT::fp_vector_valuetypes())
01215       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
01216 
01217     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01218     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01219 
01220     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01221     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01222 
01223     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01224     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01225 
01226     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01227     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01228     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01229     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01230 
01231     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01232     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01233     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01234 
01235     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01236     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01237     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01238     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01239     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01240     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01241     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01242     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01243     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01244     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01245     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01246     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01247 
01248     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01249       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01250       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01251       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01252       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01253       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01254       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01255     }
01256 
01257     if (Subtarget->hasInt256()) {
01258       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01259       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01260       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01261       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01262 
01263       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01264       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01265       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01266       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01267 
01268       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01269       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01270       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01271       // Don't lower v32i8 because there is no 128-bit byte mul
01272 
01273       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
01274       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
01275       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
01276       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
01277 
01278       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
01279       // when we have a 256-bit-wide blend with an immediate.
01280       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
01281 
01282       // Only provide the customized ctpop vector bit-twiddling for vector types
01283       // where we know it performs better than using the popcnt instruction on
01284       // each vector element. If popcnt isn't supported, always provide the
01285       // custom version.
01286       if (!Subtarget->hasPOPCNT())
01287         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
01288 
01289       // Custom CTPOP always performs better on natively supported v8i32
01290       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
01291 
01292       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
01293       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
01294       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
01295       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
01296       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
01297       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
01298       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
01299 
01300       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
01301       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
01302       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
01303       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
01304       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
01305       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
01306     } else {
01307       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01308       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01309       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01310       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01311 
01312       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01313       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01314       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01315       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01316 
01317       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01318       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01319       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01320       // Don't lower v32i8 because there is no 128-bit byte mul
01321     }
01322 
01323     // In the customized shift lowering, the legal cases in AVX2 will be
01324     // recognized.
01325     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01326     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01327 
01328     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01329     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01330 
01331     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01332 
01333     // Custom lower several nodes for 256-bit types.
01334     for (MVT VT : MVT::vector_valuetypes()) {
01335       if (VT.getScalarSizeInBits() >= 32) {
01336         setOperationAction(ISD::MLOAD,  VT, Legal);
01337         setOperationAction(ISD::MSTORE, VT, Legal);
01338       }
01339       // Extract subvector is special because the value type
01340       // (result) is 128-bit but the source is 256-bit wide.
01341       if (VT.is128BitVector()) {
01342         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01343       }
01344       // Do not attempt to custom lower other non-256-bit vectors
01345       if (!VT.is256BitVector())
01346         continue;
01347 
01348       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01349       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01350       setOperationAction(ISD::VSELECT,            VT, Custom);
01351       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01352       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01353       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01354       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01355       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01356     }
01357 
01358     if (Subtarget->hasInt256())
01359       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01360 
01361 
01362     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01363     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01364       MVT VT = (MVT::SimpleValueType)i;
01365 
01366       // Do not attempt to promote non-256-bit vectors
01367       if (!VT.is256BitVector())
01368         continue;
01369 
01370       setOperationAction(ISD::AND,    VT, Promote);
01371       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01372       setOperationAction(ISD::OR,     VT, Promote);
01373       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01374       setOperationAction(ISD::XOR,    VT, Promote);
01375       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01376       setOperationAction(ISD::LOAD,   VT, Promote);
01377       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01378       setOperationAction(ISD::SELECT, VT, Promote);
01379       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01380     }
01381   }
01382 
01383   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01384     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01385     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01386     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01387     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01388 
01389     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01390     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01391     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01392 
01393     for (MVT VT : MVT::fp_vector_valuetypes())
01394       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
01395 
01396     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01397     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01398     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01399     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01400     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01401     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01402     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01403     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01404     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01405     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01406 
01407     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01408     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01409     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01410     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01411     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01412     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01413 
01414     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01415     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01416     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01417     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01418     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01419     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01420     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01421     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01422 
01423     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01424     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01425     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01426     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01427     if (Subtarget->is64Bit()) {
01428       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01429       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01430       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01431       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01432     }
01433     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01434     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01435     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01436     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01437     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01438     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
01439     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
01440     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
01441     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
01442     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01443     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01444     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01445     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01446     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01447 
01448     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01449     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01450     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01451     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01452     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01453     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01454     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01455     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01456     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01457     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01458     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01459     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01460     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01461 
01462     setOperationAction(ISD::FFLOOR,             MVT::v16f32, Legal);
01463     setOperationAction(ISD::FFLOOR,             MVT::v8f64, Legal);
01464     setOperationAction(ISD::FCEIL,              MVT::v16f32, Legal);
01465     setOperationAction(ISD::FCEIL,              MVT::v8f64, Legal);
01466     setOperationAction(ISD::FTRUNC,             MVT::v16f32, Legal);
01467     setOperationAction(ISD::FTRUNC,             MVT::v8f64, Legal);
01468     setOperationAction(ISD::FRINT,              MVT::v16f32, Legal);
01469     setOperationAction(ISD::FRINT,              MVT::v8f64, Legal);
01470     setOperationAction(ISD::FNEARBYINT,         MVT::v16f32, Legal);
01471     setOperationAction(ISD::FNEARBYINT,         MVT::v8f64, Legal);
01472 
01473     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01474     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01475     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01476     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01477     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
01478     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01479 
01480     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01481     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01482 
01483     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01484 
01485     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01486     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01487     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01488     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01489     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01490     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01491     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01492     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01493     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01494 
01495     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01496     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01497 
01498     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01499     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01500 
01501     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01502 
01503     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01504     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01505 
01506     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01507     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01508 
01509     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01510     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01511 
01512     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01513     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01514     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01515     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01516     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01517     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01518 
01519     if (Subtarget->hasCDI()) {
01520       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
01521       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
01522     }
01523 
01524     // Custom lower several nodes.
01525     for (MVT VT : MVT::vector_valuetypes()) {
01526       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01527       // Extract subvector is special because the value type
01528       // (result) is 256/128-bit but the source is 512-bit wide.
01529       if (VT.is128BitVector() || VT.is256BitVector()) {
01530         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01531       }
01532       if (VT.getVectorElementType() == MVT::i1)
01533         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01534 
01535       // Do not attempt to custom lower other non-512-bit vectors
01536       if (!VT.is512BitVector())
01537         continue;
01538 
01539       if (EltSize >= 32) {
01540         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01541         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01542         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01543         setOperationAction(ISD::VSELECT,             VT, Legal);
01544         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01545         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01546         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01547         setOperationAction(ISD::MLOAD,               VT, Legal);
01548         setOperationAction(ISD::MSTORE,              VT, Legal);
01549       }
01550     }
01551     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01552       MVT VT = (MVT::SimpleValueType)i;
01553 
01554       // Do not attempt to promote non-512-bit vectors.
01555       if (!VT.is512BitVector())
01556         continue;
01557 
01558       setOperationAction(ISD::SELECT, VT, Promote);
01559       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01560     }
01561   } // has AVX-512
01562 
01563   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
01564     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
01565     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
01566 
01567     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
01568     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
01569 
01570     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
01571     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
01572     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
01573     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
01574     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
01575     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
01576     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
01577     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
01578     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
01579 
01580     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01581       const MVT VT = (MVT::SimpleValueType)i;
01582 
01583       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01584 
01585       // Do not attempt to promote non-512-bit vectors.
01586       if (!VT.is512BitVector())
01587         continue;
01588 
01589       if (EltSize < 32) {
01590         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01591         setOperationAction(ISD::VSELECT,             VT, Legal);
01592       }
01593     }
01594   }
01595 
01596   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
01597     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
01598     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
01599 
01600     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
01601     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
01602     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
01603 
01604     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
01605     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
01606     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
01607     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
01608     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
01609     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
01610   }
01611 
01612   // We want to custom lower some of our intrinsics.
01613   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01614   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01615   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01616   if (!Subtarget->is64Bit())
01617     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01618 
01619   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01620   // handle type legalization for these operations here.
01621   //
01622   // FIXME: We really should do custom legalization for addition and
01623   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01624   // than generic legalization for 64-bit multiplication-with-overflow, though.
01625   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01626     // Add/Sub/Mul with overflow operations are custom lowered.
01627     MVT VT = IntVTs[i];
01628     setOperationAction(ISD::SADDO, VT, Custom);
01629     setOperationAction(ISD::UADDO, VT, Custom);
01630     setOperationAction(ISD::SSUBO, VT, Custom);
01631     setOperationAction(ISD::USUBO, VT, Custom);
01632     setOperationAction(ISD::SMULO, VT, Custom);
01633     setOperationAction(ISD::UMULO, VT, Custom);
01634   }
01635 
01636 
01637   if (!Subtarget->is64Bit()) {
01638     // These libcalls are not available in 32-bit.
01639     setLibcallName(RTLIB::SHL_I128, nullptr);
01640     setLibcallName(RTLIB::SRL_I128, nullptr);
01641     setLibcallName(RTLIB::SRA_I128, nullptr);
01642   }
01643 
01644   // Combine sin / cos into one node or libcall if possible.
01645   if (Subtarget->hasSinCos()) {
01646     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01647     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01648     if (Subtarget->isTargetDarwin()) {
01649       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
01650       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
01651       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01652       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01653     }
01654   }
01655 
01656   if (Subtarget->isTargetWin64()) {
01657     setOperationAction(ISD::SDIV, MVT::i128, Custom);
01658     setOperationAction(ISD::UDIV, MVT::i128, Custom);
01659     setOperationAction(ISD::SREM, MVT::i128, Custom);
01660     setOperationAction(ISD::UREM, MVT::i128, Custom);
01661     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
01662     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
01663   }
01664 
01665   // We have target-specific dag combine patterns for the following nodes:
01666   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01667   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01668   setTargetDAGCombine(ISD::BITCAST);
01669   setTargetDAGCombine(ISD::VSELECT);
01670   setTargetDAGCombine(ISD::SELECT);
01671   setTargetDAGCombine(ISD::SHL);
01672   setTargetDAGCombine(ISD::SRA);
01673   setTargetDAGCombine(ISD::SRL);
01674   setTargetDAGCombine(ISD::OR);
01675   setTargetDAGCombine(ISD::AND);
01676   setTargetDAGCombine(ISD::ADD);
01677   setTargetDAGCombine(ISD::FADD);
01678   setTargetDAGCombine(ISD::FSUB);
01679   setTargetDAGCombine(ISD::FMA);
01680   setTargetDAGCombine(ISD::SUB);
01681   setTargetDAGCombine(ISD::LOAD);
01682   setTargetDAGCombine(ISD::MLOAD);
01683   setTargetDAGCombine(ISD::STORE);
01684   setTargetDAGCombine(ISD::MSTORE);
01685   setTargetDAGCombine(ISD::ZERO_EXTEND);
01686   setTargetDAGCombine(ISD::ANY_EXTEND);
01687   setTargetDAGCombine(ISD::SIGN_EXTEND);
01688   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01689   setTargetDAGCombine(ISD::TRUNCATE);
01690   setTargetDAGCombine(ISD::SINT_TO_FP);
01691   setTargetDAGCombine(ISD::SETCC);
01692   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
01693   setTargetDAGCombine(ISD::BUILD_VECTOR);
01694   setTargetDAGCombine(ISD::MUL);
01695   setTargetDAGCombine(ISD::XOR);
01696 
01697   computeRegisterProperties(Subtarget->getRegisterInfo());
01698 
01699   // On Darwin, -Os means optimize for size without hurting performance, so
01700   // do not reduce the limit.
01701   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01702   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01703   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01704   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01705   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01706   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01707   setPrefLoopAlignment(4); // 2^4 bytes.
01708 
01709   // Predictable cmovs don't hurt on Atom because it is in-order.
01710   PredictableSelectIsExpensive = !Subtarget->isAtom();
01711   EnableExtLdPromotion = true;
01712   setPrefFunctionAlignment(4); // 2^4 bytes.
01713 
01714   verifyIntrinsicTables();
01715 }
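
// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---------
// The constructor above marks SADDO/UADDO/SSUBO/USUBO/SMULO/UMULO as Custom so
// that overflow checks can use EFLAGS directly. One source-level pattern that
// produces those nodes is the overflow builtins: clang lowers
// __builtin_add_overflow to the llvm.*.with.overflow intrinsics, which arrive
// in this backend as ISD::SADDO / ISD::UADDO. The function name below is
// hypothetical and exists only to show the shape of such code.
#include <cstdint>
static bool addWouldOverflow(int32_t A, int32_t B, int32_t &Sum) {
  // Typically lowered on x86 to an ADD that sets EFLAGS followed by a SETO,
  // rather than a widening add plus an explicit range comparison.
  return __builtin_add_overflow(A, B, &Sum);
}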
01716 
01717 // This has so far only been implemented for 64-bit MachO.
01718 bool X86TargetLowering::useLoadStackGuardNode() const {
01719   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
01720 }
01721 
01722 TargetLoweringBase::LegalizeTypeAction
01723 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
01724   if (ExperimentalVectorWideningLegalization &&
01725       VT.getVectorNumElements() != 1 &&
01726       VT.getVectorElementType().getSimpleVT() != MVT::i1)
01727     return TypeWidenVector;
01728 
01729   return TargetLoweringBase::getPreferredVectorAction(VT);
01730 }
01731 
01732 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01733   if (!VT.isVector())
01734     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
01735 
01736   const unsigned NumElts = VT.getVectorNumElements();
01737   const EVT EltVT = VT.getVectorElementType();
01738   if (VT.is512BitVector()) {
01739     if (Subtarget->hasAVX512())
01740       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01741           EltVT == MVT::f32 || EltVT == MVT::f64)
01742         switch(NumElts) {
01743         case  8: return MVT::v8i1;
01744         case 16: return MVT::v16i1;
01745       }
01746     if (Subtarget->hasBWI())
01747       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01748         switch(NumElts) {
01749         case 32: return MVT::v32i1;
01750         case 64: return MVT::v64i1;
01751       }
01752   }
01753 
01754   if (VT.is256BitVector() || VT.is128BitVector()) {
01755     if (Subtarget->hasVLX())
01756       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01757           EltVT == MVT::f32 || EltVT == MVT::f64)
01758         switch(NumElts) {
01759         case 2: return MVT::v2i1;
01760         case 4: return MVT::v4i1;
01761         case 8: return MVT::v8i1;
01762       }
01763     if (Subtarget->hasBWI() && Subtarget->hasVLX())
01764       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01765         switch(NumElts) {
01766         case  8: return MVT::v8i1;
01767         case 16: return MVT::v16i1;
01768         case 32: return MVT::v32i1;
01769       }
01770   }
01771 
01772   return VT.changeVectorElementTypeToInteger();
01773 }
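
// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---------
// A standalone mirror of the 512-bit branch of getSetCCResultType above,
// assuming AVX-512: compares on 32/64-bit elements yield v8i1/v16i1 masks,
// and with BWI 8/16-bit elements yield v32i1/v64i1. The enum and helper below
// are hypothetical and only restate that mapping in isolation.
enum class MaskKind { v8i1, v16i1, v32i1, v64i1, Other };
static MaskKind mask512For(unsigned NumElts, unsigned EltBits, bool HasBWI) {
  if (EltBits == 32 || EltBits == 64)
    return NumElts == 8 ? MaskKind::v8i1
                        : NumElts == 16 ? MaskKind::v16i1 : MaskKind::Other;
  if (HasBWI && (EltBits == 8 || EltBits == 16))
    return NumElts == 32 ? MaskKind::v32i1
                         : NumElts == 64 ? MaskKind::v64i1 : MaskKind::Other;
  return MaskKind::Other;
}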
01774 
01775 /// Helper for getByValTypeAlignment to determine
01776 /// the desired ByVal argument alignment.
01777 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01778   if (MaxAlign == 16)
01779     return;
01780   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01781     if (VTy->getBitWidth() == 128)
01782       MaxAlign = 16;
01783   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01784     unsigned EltAlign = 0;
01785     getMaxByValAlign(ATy->getElementType(), EltAlign);
01786     if (EltAlign > MaxAlign)
01787       MaxAlign = EltAlign;
01788   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01789     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01790       unsigned EltAlign = 0;
01791       getMaxByValAlign(STy->getElementType(i), EltAlign);
01792       if (EltAlign > MaxAlign)
01793         MaxAlign = EltAlign;
01794       if (MaxAlign == 16)
01795         break;
01796     }
01797   }
01798 }
01799 
01800 /// Return the desired alignment for ByVal aggregate
01801 /// function arguments in the caller parameter area. For X86, aggregates
01802 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01803 /// are at 4-byte boundaries.
01804 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01805   if (Subtarget->is64Bit()) {
01806     // Max of 8 and alignment of type.
01807     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01808     if (TyAlign > 8)
01809       return TyAlign;
01810     return 8;
01811   }
01812 
01813   unsigned Align = 4;
01814   if (Subtarget->hasSSE1())
01815     getMaxByValAlign(Ty, Align);
01816   return Align;
01817 }
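
// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---------
// Example of the rule implemented by getByValTypeAlignment/getMaxByValAlign on
// a 32-bit SSE target: an aggregate containing a 128-bit vector is placed at a
// 16-byte boundary, while an all-scalar aggregate stays at 4 bytes (64-bit
// targets use at least 8). The types below are illustrative only and rely on
// the GCC/Clang vector_size extension.
typedef float V4SF __attribute__((vector_size(16)));
struct WithVector  { V4SF Lanes; int Tag; };  // byval alignment: 16
struct ScalarsOnly { int A, B, C; };          // byval alignment: 4 (8 on x86-64)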
01818 
01819 /// Returns the target specific optimal type for load
01820 /// and store operations as a result of memset, memcpy, and memmove
01821 /// lowering. If DstAlign is zero, it is safe to assume that the destination
01822 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero it
01823 /// means there is no need to check it against an alignment requirement,
01824 /// probably because the source does not need to be loaded. If 'IsMemset' is
01825 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
01826 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
01827 /// source is constant so it does not need to be loaded.
01828 /// It returns EVT::Other if the type should be determined using generic
01829 /// target-independent logic.
01830 EVT
01831 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01832                                        unsigned DstAlign, unsigned SrcAlign,
01833                                        bool IsMemset, bool ZeroMemset,
01834                                        bool MemcpyStrSrc,
01835                                        MachineFunction &MF) const {
01836   const Function *F = MF.getFunction();
01837   if ((!IsMemset || ZeroMemset) &&
01838       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
01839     if (Size >= 16 &&
01840         (Subtarget->isUnalignedMemAccessFast() ||
01841          ((DstAlign == 0 || DstAlign >= 16) &&
01842           (SrcAlign == 0 || SrcAlign >= 16)))) {
01843       if (Size >= 32) {
01844         if (Subtarget->hasInt256())
01845           return MVT::v8i32;
01846         if (Subtarget->hasFp256())
01847           return MVT::v8f32;
01848       }
01849       if (Subtarget->hasSSE2())
01850         return MVT::v4i32;
01851       if (Subtarget->hasSSE1())
01852         return MVT::v4f32;
01853     } else if (!MemcpyStrSrc && Size >= 8 &&
01854                !Subtarget->is64Bit() &&
01855                Subtarget->hasSSE2()) {
01856       // Do not use f64 to lower memcpy if the source is a string constant. It's
01857       // better to use i32 to avoid the loads.
01858       return MVT::f64;
01859     }
01860   }
01861   if (Subtarget->is64Bit() && Size >= 8)
01862     return MVT::i64;
01863   return MVT::i32;
01864 }
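
// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---------
// Effect of the selection above (a sketch, assuming an AVX2 subtarget with
// fast unaligned access): a 32-byte fixed-size memcpy is expected to expand to
// a single v8i32 (YMM) load/store pair rather than a run of scalar moves. The
// struct and function names here are hypothetical.
#include <cstring>
struct Block32 { char Bytes[32]; };
static void copyBlock(Block32 *Dst, const Block32 *Src) {
  std::memcpy(Dst, Src, sizeof(Block32)); // candidate for the wide vector path
}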
01865 
01866 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01867   if (VT == MVT::f32)
01868     return X86ScalarSSEf32;
01869   else if (VT == MVT::f64)
01870     return X86ScalarSSEf64;
01871   return true;
01872 }
01873 
01874 bool
01875 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
01876                                                   unsigned,
01877                                                   unsigned,
01878                                                   bool *Fast) const {
01879   if (Fast)
01880     *Fast = Subtarget->isUnalignedMemAccessFast();
01881   return true;
01882 }
01883 
01884 /// Return the entry encoding for a jump table in the
01885 /// current function.  The returned value is a member of the
01886 /// MachineJumpTableInfo::JTEntryKind enum.
01887 unsigned X86TargetLowering::getJumpTableEncoding() const {
01888   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01889   // symbol.
01890   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01891       Subtarget->isPICStyleGOT())
01892     return MachineJumpTableInfo::EK_Custom32;
01893 
01894   // Otherwise, use the normal jump table encoding heuristics.
01895   return TargetLowering::getJumpTableEncoding();
01896 }
01897 
01898 const MCExpr *
01899 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01900                                              const MachineBasicBlock *MBB,
01901                                              unsigned uid, MCContext &Ctx) const {
01902   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
01903          Subtarget->isPICStyleGOT());
01904   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01905   // entries.
01906   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01907                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01908 }
01909 
01910 /// Returns relocation base for the given PIC jumptable.
01911 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01912                                                     SelectionDAG &DAG) const {
01913   if (!Subtarget->is64Bit())
01914     // This doesn't have SDLoc associated with it, but is not really the
01915     // same as a Register.
01916     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01917   return Table;
01918 }
01919 
01920 /// This returns the relocation base for the given PIC jumptable,
01921 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
01922 const MCExpr *X86TargetLowering::
01923 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01924                              MCContext &Ctx) const {
01925   // X86-64 uses RIP relative addressing based on the jump table label.
01926   if (Subtarget->isPICStyleRIPRel())
01927     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01928 
01929   // Otherwise, the reference is relative to the PIC base.
01930   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01931 }
01932 
01933 std::pair<const TargetRegisterClass *, uint8_t>
01934 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
01935                                            MVT VT) const {
01936   const TargetRegisterClass *RRC = nullptr;
01937   uint8_t Cost = 1;
01938   switch (VT.SimpleTy) {
01939   default:
01940     return TargetLowering::findRepresentativeClass(TRI, VT);
01941   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01942     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
01943     break;
01944   case MVT::x86mmx:
01945     RRC = &X86::VR64RegClass;
01946     break;
01947   case MVT::f32: case MVT::f64:
01948   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01949   case MVT::v4f32: case MVT::v2f64:
01950   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01951   case MVT::v4f64:
01952     RRC = &X86::VR128RegClass;
01953     break;
01954   }
01955   return std::make_pair(RRC, Cost);
01956 }
01957 
01958 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01959                                                unsigned &Offset) const {
01960   if (!Subtarget->isTargetLinux())
01961     return false;
01962 
01963   if (Subtarget->is64Bit()) {
01964     // %fs:0x28, unless we're using the Kernel code model, in which case it's %gs:0x28.
01965     Offset = 0x28;
01966     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01967       AddressSpace = 256;
01968     else
01969       AddressSpace = 257;
01970   } else {
01971     // %gs:0x14 on i386
01972     Offset = 0x14;
01973     AddressSpace = 256;
01974   }
01975   return true;
01976 }
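
// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---------
// The AddressSpace/Offset pair chosen above corresponds to %fs:0x28 on x86-64
// Linux (and %gs:0x14 on i386). Purely as an illustration, that slot can be
// read from C++ with GCC/Clang inline assembly; this helper is hypothetical
// and x86-64-Linux-only.
#include <cstdint>
static inline uint64_t readStackCookie() {
  uint64_t Cookie;
  asm volatile("movq %%fs:0x28, %0" : "=r"(Cookie)); // same slot as Offset above
  return Cookie;
}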
01977 
01978 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01979                                             unsigned DestAS) const {
01980   assert(SrcAS != DestAS && "Expected different address spaces!");
01981 
01982   return SrcAS < 256 && DestAS < 256;
01983 }
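
// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---------
// As used above, x86 reserves address spaces 256 and 257 for %gs- and
// %fs-relative addressing, so casts involving them are not no-ops, while casts
// among "ordinary" (< 256) address spaces are. A Clang-only way to form such a
// pointer uses the address_space attribute; the typedef and helper below are
// illustrative, not part of this file.
typedef int __attribute__((address_space(257))) FSInt; // %fs-relative int
static inline int loadFSRelative(FSInt *Ptr) {
  return *Ptr; // the load is emitted with an %fs segment override
}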
01984 
01985 //===----------------------------------------------------------------------===//
01986 //               Return Value Calling Convention Implementation
01987 //===----------------------------------------------------------------------===//
01988 
01989 #include "X86GenCallingConv.inc"
01990 
01991 bool
01992 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01993                                   MachineFunction &MF, bool isVarArg,
01994                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01995                         LLVMContext &Context) const {
01996   SmallVector<CCValAssign, 16> RVLocs;
01997   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
01998   return CCInfo.CheckReturn(Outs, RetCC_X86);
01999 }
02000 
02001 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
02002   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
02003   return ScratchRegs;
02004 }
02005 
02006 SDValue
02007 X86TargetLowering::LowerReturn(SDValue Chain,
02008                                CallingConv::ID CallConv, bool isVarArg,
02009                                const SmallVectorImpl<ISD::OutputArg> &Outs,
02010                                const SmallVectorImpl<SDValue> &OutVals,
02011                                SDLoc dl, SelectionDAG &DAG) const {
02012   MachineFunction &MF = DAG.getMachineFunction();
02013   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02014 
02015   SmallVector<CCValAssign, 16> RVLocs;
02016   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
02017   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
02018 
02019   SDValue Flag;
02020   SmallVector<SDValue, 6> RetOps;
02021   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
02022   // Operand #1 = Bytes To Pop
02023   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
02024                    MVT::i16));
02025 
02026   // Copy the result values into the output registers.
02027   for (unsigned i = 0; i != RVLocs.size(); ++i) {
02028     CCValAssign &VA = RVLocs[i];
02029     assert(VA.isRegLoc() && "Can only return in registers!");
02030     SDValue ValToCopy = OutVals[i];
02031     EVT ValVT = ValToCopy.getValueType();
02032 
02033     // Promote values to the appropriate types.
02034     if (VA.getLocInfo() == CCValAssign::SExt)
02035       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
02036     else if (VA.getLocInfo() == CCValAssign::ZExt)
02037       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
02038     else if (VA.getLocInfo() == CCValAssign::AExt)
02039       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
02040     else if (VA.getLocInfo() == CCValAssign::BCvt)
02041       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
02042 
02043     assert(VA.getLocInfo() != CCValAssign::FPExt &&
02044            "Unexpected FP-extend for return value.");
02045 
02046     // If this is x86-64, and we disabled SSE, we can't return FP values,
02047     // or SSE or MMX vectors.
02048     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
02049          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
02050           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
02051       report_fatal_error("SSE register return with SSE disabled");
02052     }
02053     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
02054     // llvm-gcc has never done it right and no one has noticed, so this
02055     // should be OK for now.
02056     if (ValVT == MVT::f64 &&
02057         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
02058       report_fatal_error("SSE2 register return with SSE2 disabled");
02059 
02060     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
02061     // the RET instruction and handled by the FP Stackifier.
02062     if (VA.getLocReg() == X86::FP0 ||
02063         VA.getLocReg() == X86::FP1) {
02064       // If this is a copy from an xmm register to ST(0), use an FPExtend to
02065       // change the value to the FP stack register class.
02066       if (isScalarFPTypeInSSEReg(VA.getValVT()))
02067         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
02068       RetOps.push_back(ValToCopy);
02069       // Don't emit a copytoreg.
02070       continue;
02071     }
02072 
02073     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
02074     // which is returned in RAX / RDX.
02075     if (Subtarget->is64Bit()) {
02076       if (ValVT == MVT::x86mmx) {
02077         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
02078           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
02079           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
02080                                   ValToCopy);
02081           // If we don't have SSE2 available, convert to v4f32 so the generated
02082           // register is legal.
02083           if (!Subtarget->hasSSE2())
02084             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
02085         }
02086       }
02087     }
02088 
02089     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
02090     Flag = Chain.getValue(1);
02091     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
02092   }
02093 
02094   // The x86-64 ABIs require that for returning structs by value we copy
02095   // the sret argument into %rax/%eax (depending on ABI) for the return.
02096   // Win32 requires us to put the sret argument to %eax as well.
02097   // We saved the argument into a virtual register in the entry block,
02098   // so now we copy the value out and into %rax/%eax.
02099   //
02100   // Checking Function.hasStructRetAttr() here is insufficient because the IR
02101   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
02102   // false, then an sret argument may be implicitly inserted in the SelDAG. In
02103   // either case FuncInfo->setSRetReturnReg() will have been called.
02104   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
02105     assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
02106            "No need for an sret register");
02107     SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
02108 
02109     unsigned RetValReg
02110         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
02111           X86::RAX : X86::EAX;
02112     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
02113     Flag = Chain.getValue(1);
02114 
02115     // RAX/EAX now acts like a return value.
02116     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
02117   }
02118 
02119   RetOps[0] = Chain;  // Update chain.
02120 
02121   // Add the flag if we have it.
02122   if (Flag.getNode())
02123     RetOps.push_back(Flag);
02124 
02125   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
02126 }
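
// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---------
// Source-level shape of the sret case handled near the end of LowerReturn: a
// function returning a large aggregate receives a hidden pointer argument, and
// the lowering above copies that pointer back into RAX (or EAX where the ABI
// requires it) before the RET. The type and function below are hypothetical.
struct Large { long Payload[4]; };
static Large makeLarge() {     // roughly: void makeLarge(Large *sret) at the ABI level
  Large Result = {{1, 2, 3, 4}};
  return Result;               // the sret pointer is also produced in RAX/EAX
}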
02127 
02128 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
02129   if (N->getNumValues() != 1)
02130     return false;
02131   if (!N->hasNUsesOfValue(1, 0))
02132     return false;
02133 
02134   SDValue TCChain = Chain;
02135   SDNode *Copy = *N->use_begin();
02136   if (Copy->getOpcode() == ISD::CopyToReg) {
02137     // If the copy has a glue operand, we conservatively assume it isn't safe to
02138     // perform a tail call.
02139     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
02140       return false;
02141     TCChain = Copy->getOperand(0);
02142   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
02143     return false;
02144 
02145   bool HasRet = false;
02146   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
02147        UI != UE; ++UI) {
02148     if (UI->getOpcode() != X86ISD::RET_FLAG)
02149       return false;
02150     // If we are returning more than one value, we can definitely
02151     // not make a tail call; see PR19530.
02152     if (UI->getNumOperands() > 4)
02153       return false;
02154     if (UI->getNumOperands() == 4 &&
02155         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
02156       return false;
02157     HasRet = true;
02158   }
02159 
02160   if (!HasRet)
02161     return false;
02162 
02163   Chain = TCChain;
02164   return true;
02165 }
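
// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---------
// Shape of code the check above cares about: the call result flows directly
// into the return, so, when the other conditions hold, the RET can be folded
// into a tail call. Function names are hypothetical.
int callee(int X);
static int caller(int X) {
  return callee(X + 1); // single use of the call result, straight into the return
}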
02166 
02167 EVT
02168 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
02169                                             ISD::NodeType ExtendKind) const {
02170   MVT ReturnMVT;
02171   // TODO: Is this also valid on 32-bit?
02172   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
02173     ReturnMVT = MVT::i8;
02174   else
02175     ReturnMVT = MVT::i32;
02176 
02177   EVT MinVT = getRegisterType(Context, ReturnMVT);
02178   return VT.bitsLT(MinVT) ? MinVT : VT;
02179 }
02180 
02181 /// Lower the result values of a call into the
02182 /// appropriate copies out of appropriate physical registers.
02183 ///
02184 SDValue
02185 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
02186                                    CallingConv::ID CallConv, bool isVarArg,
02187                                    const SmallVectorImpl<ISD::InputArg> &Ins,
02188                                    SDLoc dl, SelectionDAG &DAG,
02189                                    SmallVectorImpl<SDValue> &InVals) const {
02190 
02191   // Assign locations to each value returned by this call.
02192   SmallVector<CCValAssign, 16> RVLocs;
02193   bool Is64Bit = Subtarget->is64Bit();
02194   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
02195                  *DAG.getContext());
02196   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02197 
02198   // Copy all of the result registers out of their specified physreg.
02199   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02200     CCValAssign &VA = RVLocs[i];
02201     EVT CopyVT = VA.getValVT();
02202 
02203     // If this is x86-64, and we disabled SSE, we can't return FP values
02204     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02205         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02206       report_fatal_error("SSE register return with SSE disabled");
02207     }
02208 
02209     // If we prefer to use the value in xmm registers, copy it out as f80 and
02210     // use a truncate to move it from fp stack reg to xmm reg.
02211     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
02212         isScalarFPTypeInSSEReg(VA.getValVT()))
02213       CopyVT = MVT::f80;
02214 
02215     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02216                                CopyVT, InFlag).getValue(1);
02217     SDValue Val = Chain.getValue(0);
02218 
02219     if (CopyVT != VA.getValVT())
02220       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02221                         // This truncation won't change the value.
02222                         DAG.getIntPtrConstant(1));
02223 
02224     InFlag = Chain.getValue(2);
02225     InVals.push_back(Val);
02226   }
02227 
02228   return Chain;
02229 }
02230 
02231 //===----------------------------------------------------------------------===//
02232 //                C & StdCall & Fast Calling Convention implementation
02233 //===----------------------------------------------------------------------===//
02234 //  The StdCall calling convention is the standard for many Windows API
02235 //  routines and the like. It differs from the C calling convention only a
02236 //  little: the callee cleans up the stack rather than the caller, and symbols
02237 //  are decorated in some fancy way :) It doesn't support any vector arguments.
02238 //  For info on fast calling convention see Fast Calling Convention (tail call)
02239 //  implementation LowerX86_32FastCCCallTo.
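//  As a rough illustration (hypothetical names, not part of this file): a Win32
//  routine declared as 'int __stdcall Blah(int a, int b);' is decorated as
//  _Blah@8 and pops its own 8 bytes of arguments on return, whereas the cdecl
//  equivalent leaves those 8 bytes for the caller to pop.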
02240 
02241 /// CallIsStructReturn - Determines whether a call uses struct return
02242 /// semantics.
02243 enum StructReturnType {
02244   NotStructReturn,
02245   RegStructReturn,
02246   StackStructReturn
02247 };
02248 static StructReturnType
02249 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02250   if (Outs.empty())
02251     return NotStructReturn;
02252 
02253   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02254   if (!Flags.isSRet())
02255     return NotStructReturn;
02256   if (Flags.isInReg())
02257     return RegStructReturn;
02258   return StackStructReturn;
02259 }
02260 
02261 /// Determines whether a function uses struct return semantics.
02262 static StructReturnType
02263 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02264   if (Ins.empty())
02265     return NotStructReturn;
02266 
02267   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02268   if (!Flags.isSRet())
02269     return NotStructReturn;
02270   if (Flags.isInReg())
02271     return RegStructReturn;
02272   return StackStructReturn;
02273 }
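// For example (illustrative C++ only), a function returning a large aggregate
// by value, such as 'struct S { int a[8]; }; S makeS();', is lowered so that
// the caller passes a hidden pointer to the result slot as the first ('sret')
// argument; the two helpers above detect that pattern on the call side and the
// formal-argument side respectively.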
02274 
02275 /// Make a copy of an aggregate at address specified by "Src" to address
02276 /// "Dst" with size and alignment information specified by the specific
02277 /// parameter attribute. The copy will be passed as a byval function parameter.
02278 static SDValue
02279 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02280                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02281                           SDLoc dl) {
02282   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02283 
02284   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02285                        /*isVolatile*/false, /*AlwaysInline=*/true,
02286                        MachinePointerInfo(), MachinePointerInfo());
02287 }
02288 
02289 /// Return true if the calling convention is one that
02290 /// supports tail call optimization.
02291 static bool IsTailCallConvention(CallingConv::ID CC) {
02292   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02293           CC == CallingConv::HiPE);
02294 }
02295 
02296 /// \brief Return true if the calling convention is a C calling convention.
02297 static bool IsCCallConvention(CallingConv::ID CC) {
02298   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02299           CC == CallingConv::X86_64_SysV);
02300 }
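// Together these predicates bound which calls are even considered for tail
// calls below: the tail-call-by-convention set (fastcc, GHC, HiPE, which rely
// on -tailcallopt for guaranteed tail calls) and the plain C family (C, Win64,
// SysV), which is only eligible for opportunistic sibcalls.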
02301 
02302 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02303   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02304     return false;
02305 
02306   CallSite CS(CI);
02307   CallingConv::ID CalleeCC = CS.getCallingConv();
02308   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02309     return false;
02310 
02311   return true;
02312 }
02313 
02314 /// Return true if the function is being made into
02315 /// a tailcall target by changing its ABI.
02316 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02317                                    bool GuaranteedTailCallOpt) {
02318   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02319 }
02320 
02321 SDValue
02322 X86TargetLowering::LowerMemArgument(SDValue Chain,
02323                                     CallingConv::ID CallConv,
02324                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02325                                     SDLoc dl, SelectionDAG &DAG,
02326                                     const CCValAssign &VA,
02327                                     MachineFrameInfo *MFI,
02328                                     unsigned i) const {
02329   // Create the nodes corresponding to a load from this parameter slot.
02330   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02331   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
02332       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
02333   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02334   EVT ValVT;
02335 
02336   // If the value is passed by pointer, we have the address passed instead of
02337   // the value itself.
02338   if (VA.getLocInfo() == CCValAssign::Indirect)
02339     ValVT = VA.getLocVT();
02340   else
02341     ValVT = VA.getValVT();
02342 
02343   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02344   // changed with more analysis.
02345   // In the case of tail call optimization, mark all arguments mutable, since
02346   // they could be overwritten by the lowering of arguments during a tail call.
02347   if (Flags.isByVal()) {
02348     unsigned Bytes = Flags.getByValSize();
02349     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02350     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02351     return DAG.getFrameIndex(FI, getPointerTy());
02352   } else {
02353     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02354                                     VA.getLocMemOffset(), isImmutable);
02355     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02356     return DAG.getLoad(ValVT, dl, Chain, FIN,
02357                        MachinePointerInfo::getFixedStack(FI),
02358                        false, false, false, 0);
02359   }
02360 }
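// Note that for byval arguments the helper above hands back the frame index of
// the in-memory copy itself, while every other argument is materialized with a
// load from its fixed stack slot.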
02361 
02362 // FIXME: Get this from tablegen.
02363 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
02364                                                 const X86Subtarget *Subtarget) {
02365   assert(Subtarget->is64Bit());
02366 
02367   if (Subtarget->isCallingConvWin64(CallConv)) {
02368     static const MCPhysReg GPR64ArgRegsWin64[] = {
02369       X86::RCX, X86::RDX, X86::R8,  X86::R9
02370     };
02371     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
02372   }
02373 
02374   static const MCPhysReg GPR64ArgRegs64Bit[] = {
02375     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02376   };
02377   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
02378 }
02379 
02380 // FIXME: Get this from tablegen.
02381 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
02382                                                 CallingConv::ID CallConv,
02383                                                 const X86Subtarget *Subtarget) {
02384   assert(Subtarget->is64Bit());
02385   if (Subtarget->isCallingConvWin64(CallConv)) {
02386     // The XMM registers which might contain var arg parameters are shadowed
02387     // in their paired GPR.  So we only need to save the GPRs to their home
02388     // slots.
02389     // TODO: __vectorcall will change this.
02390     return None;
02391   }
02392 
02393   const Function *Fn = MF.getFunction();
02394   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
02395   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
02396          "SSE register cannot be used when SSE is disabled!");
02397   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02398       !Subtarget->hasSSE1())
02399     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
02400     // registers.
02401     return None;
02402 
02403   static const MCPhysReg XMMArgRegs64Bit[] = {
02404     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02405     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02406   };
02407   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
02408 }
02409 
02410 SDValue
02411 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02412                                         CallingConv::ID CallConv,
02413                                         bool isVarArg,
02414                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02415                                         SDLoc dl,
02416                                         SelectionDAG &DAG,
02417                                         SmallVectorImpl<SDValue> &InVals)
02418                                           const {
02419   MachineFunction &MF = DAG.getMachineFunction();
02420   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02421 
02422   const Function* Fn = MF.getFunction();
02423   if (Fn->hasExternalLinkage() &&
02424       Subtarget->isTargetCygMing() &&
02425       Fn->getName() == "main")
02426     FuncInfo->setForceFramePointer(true);
02427 
02428   MachineFrameInfo *MFI = MF.getFrameInfo();
02429   bool Is64Bit = Subtarget->is64Bit();
02430   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02431 
02432   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02433          "Var args not supported with calling convention fastcc, ghc or hipe");
02434 
02435   // Assign locations to all of the incoming arguments.
02436   SmallVector<CCValAssign, 16> ArgLocs;
02437   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02438 
02439   // Allocate shadow area for Win64
02440   if (IsWin64)
02441     CCInfo.AllocateStack(32, 8);
02442 
02443   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02444 
02445   unsigned LastVal = ~0U;
02446   SDValue ArgValue;
02447   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02448     CCValAssign &VA = ArgLocs[i];
02449     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02450     // places.
02451     assert(VA.getValNo() != LastVal &&
02452            "Don't support value assigned to multiple locs yet");
02453     (void)LastVal;
02454     LastVal = VA.getValNo();
02455 
02456     if (VA.isRegLoc()) {
02457       EVT RegVT = VA.getLocVT();
02458       const TargetRegisterClass *RC;
02459       if (RegVT == MVT::i32)
02460         RC = &X86::GR32RegClass;
02461       else if (Is64Bit && RegVT == MVT::i64)
02462         RC = &X86::GR64RegClass;
02463       else if (RegVT == MVT::f32)
02464         RC = &X86::FR32RegClass;
02465       else if (RegVT == MVT::f64)
02466         RC = &X86::FR64RegClass;
02467       else if (RegVT.is512BitVector())
02468         RC = &X86::VR512RegClass;
02469       else if (RegVT.is256BitVector())
02470         RC = &X86::VR256RegClass;
02471       else if (RegVT.is128BitVector())
02472         RC = &X86::VR128RegClass;
02473       else if (RegVT == MVT::x86mmx)
02474         RC = &X86::VR64RegClass;
02475       else if (RegVT == MVT::i1)
02476         RC = &X86::VK1RegClass;
02477       else if (RegVT == MVT::v8i1)
02478         RC = &X86::VK8RegClass;
02479       else if (RegVT == MVT::v16i1)
02480         RC = &X86::VK16RegClass;
02481       else if (RegVT == MVT::v32i1)
02482         RC = &X86::VK32RegClass;
02483       else if (RegVT == MVT::v64i1)
02484         RC = &X86::VK64RegClass;
02485       else
02486         llvm_unreachable("Unknown argument type!");
02487 
02488       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02489       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02490 
02491       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02492       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02493       // right size.
02494       if (VA.getLocInfo() == CCValAssign::SExt)
02495         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02496                                DAG.getValueType(VA.getValVT()));
02497       else if (VA.getLocInfo() == CCValAssign::ZExt)
02498         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02499                                DAG.getValueType(VA.getValVT()));
02500       else if (VA.getLocInfo() == CCValAssign::BCvt)
02501         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02502 
02503       if (VA.isExtInLoc()) {
02504         // Handle MMX values passed in XMM regs.
02505         if (RegVT.isVector())
02506           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02507         else
02508           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02509       }
02510     } else {
02511       assert(VA.isMemLoc());
02512       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02513     }
02514 
02515     // If the value is passed via a pointer, do a load.
02516     if (VA.getLocInfo() == CCValAssign::Indirect)
02517       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02518                              MachinePointerInfo(), false, false, false, 0);
02519 
02520     InVals.push_back(ArgValue);
02521   }
02522 
02523   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
02524     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02525       // The x86-64 ABIs require that for returning structs by value we copy
02526       // the sret argument into %rax/%eax (depending on ABI) for the return.
02527       // Win32 requires us to put the sret argument in %eax as well.
02528       // Save the argument into a virtual register so that we can access it
02529       // from the return points.
02530       if (Ins[i].Flags.isSRet()) {
02531         unsigned Reg = FuncInfo->getSRetReturnReg();
02532         if (!Reg) {
02533           MVT PtrTy = getPointerTy();
02534           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02535           FuncInfo->setSRetReturnReg(Reg);
02536         }
02537         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
02538         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02539         break;
02540       }
02541     }
02542   }
02543 
02544   unsigned StackSize = CCInfo.getNextStackOffset();
02545   // Align stack specially for tail calls.
02546   if (FuncIsMadeTailCallSafe(CallConv,
02547                              MF.getTarget().Options.GuaranteedTailCallOpt))
02548     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02549 
02550   // If the function takes a variable number of arguments, make a frame index
02551   // for the start of the first vararg value... for expansion of llvm.va_start.
02552   // We can skip this if there are no va_start calls.
02553   if (MFI->hasVAStart() &&
02554       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02555                    CallConv != CallingConv::X86_ThisCall))) {
02556     FuncInfo->setVarArgsFrameIndex(
02557         MFI->CreateFixedObject(1, StackSize, true));
02558   }
02559 
02560   // Figure out if XMM registers are in use.
02561   assert(!(MF.getTarget().Options.UseSoftFloat &&
02562            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
02563          "SSE register cannot be used when SSE is disabled!");
02564 
02565   // 64-bit calling conventions support varargs and register parameters, so we
02566   // have to do extra work to spill them in the prologue.
02567   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
02568     // Find the first unallocated argument registers.
02569     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
02570     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
02571     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
02572     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
02573     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02574            "SSE register cannot be used when SSE is disabled!");
02575 
02576     // Gather all the live in physical registers.
02577     SmallVector<SDValue, 6> LiveGPRs;
02578     SmallVector<SDValue, 8> LiveXMMRegs;
02579     SDValue ALVal;
02580     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
02581       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
02582       LiveGPRs.push_back(
02583           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
02584     }
02585     if (!ArgXMMs.empty()) {
02586       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02587       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
02588       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
02589         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
02590         LiveXMMRegs.push_back(
02591             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
02592       }
02593     }
02594 
02595     if (IsWin64) {
02596       const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
02597       // Get to the caller-allocated home save location.  Add 8 to account
02598       // for the return address.
02599       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02600       FuncInfo->setRegSaveFrameIndex(
02601           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02602       // Fixup to set vararg frame on shadow area (4 x i64).
02603       if (NumIntRegs < 4)
02604         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02605     } else {
02606       // For X86-64, if there are vararg parameters that are passed via
02607       // registers, then we must store them to their spots on the stack so
02608       // they may be loaded by dereferencing the result of va_next.
02609       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02610       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
02611       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
02612           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
02613     }
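    // On the SysV side this reserves the full register save area (48 bytes of
    // GPRs plus, when SSE is available, 128 bytes of XMM state, 176 bytes in
    // total); VarArgsGPOffset/VarArgsFPOffset roughly correspond to the
    // gp_offset/fp_offset fields that va_arg consults later.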
02614 
02615     // Store the integer parameter registers.
02616     SmallVector<SDValue, 8> MemOps;
02617     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02618                                       getPointerTy());
02619     unsigned Offset = FuncInfo->getVarArgsGPOffset();
02620     for (SDValue Val : LiveGPRs) {
02621       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02622                                 DAG.getIntPtrConstant(Offset));
02623       SDValue Store =
02624         DAG.getStore(Val.getValue(1), dl, Val, FIN,
02625                      MachinePointerInfo::getFixedStack(
02626                        FuncInfo->getRegSaveFrameIndex(), Offset),
02627                      false, false, 0);
02628       MemOps.push_back(Store);
02629       Offset += 8;
02630     }
02631 
02632     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
02633       // Now store the XMM (fp + vector) parameter registers.
02634       SmallVector<SDValue, 12> SaveXMMOps;
02635       SaveXMMOps.push_back(Chain);
02636       SaveXMMOps.push_back(ALVal);
02637       SaveXMMOps.push_back(DAG.getIntPtrConstant(
02638                              FuncInfo->getRegSaveFrameIndex()));
02639       SaveXMMOps.push_back(DAG.getIntPtrConstant(
02640                              FuncInfo->getVarArgsFPOffset()));
02641       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
02642                         LiveXMMRegs.end());
02643       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02644                                    MVT::Other, SaveXMMOps));
02645     }
02646 
02647     if (!MemOps.empty())
02648       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
02649   }
02650 
02651   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
02652     // Find the largest legal vector type.
02653     MVT VecVT = MVT::Other;
02654     // FIXME: Only some x86_32 calling conventions support AVX512.
02655     if (Subtarget->hasAVX512() &&
02656         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
02657                      CallConv == CallingConv::Intel_OCL_BI)))
02658       VecVT = MVT::v16f32;
02659     else if (Subtarget->hasAVX())
02660       VecVT = MVT::v8f32;
02661     else if (Subtarget->hasSSE2())
02662       VecVT = MVT::v4f32;
02663 
02664     // We forward some GPRs and some vector types.
02665     SmallVector<MVT, 2> RegParmTypes;
02666     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
02667     RegParmTypes.push_back(IntVT);
02668     if (VecVT != MVT::Other)
02669       RegParmTypes.push_back(VecVT);
02670 
02671     // Compute the set of forwarded registers. The rest are scratch.
02672     SmallVectorImpl<ForwardedRegister> &Forwards =
02673         FuncInfo->getForwardedMustTailRegParms();
02674     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
02675 
02676     // Conservatively forward AL on x86_64, since it might be used for varargs.
02677     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
02678       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02679       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
02680     }
02681 
02682     // Copy all forwards from physical to virtual registers.
02683     for (ForwardedRegister &F : Forwards) {
02684       // FIXME: Can we use a less constrained schedule?
02685       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02686       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
02687       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
02688     }
02689   }
02690 
02691   // Some CCs need callee pop.
02692   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02693                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02694     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02695   } else {
02696     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02697     // If this is an sret function, the return should pop the hidden pointer.
02698     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02699         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02700         argsAreStructReturn(Ins) == StackStructReturn)
02701       FuncInfo->setBytesToPopOnReturn(4);
02702   }
02703 
02704   if (!Is64Bit) {
02705     // RegSaveFrameIndex is X86-64 only.
02706     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02707     if (CallConv == CallingConv::X86_FastCall ||
02708         CallConv == CallingConv::X86_ThisCall)
02709       // fastcall and thiscall functions can't have varargs.
02710       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02711   }
02712 
02713   FuncInfo->setArgumentStackSize(StackSize);
02714 
02715   return Chain;
02716 }
02717 
02718 SDValue
02719 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02720                                     SDValue StackPtr, SDValue Arg,
02721                                     SDLoc dl, SelectionDAG &DAG,
02722                                     const CCValAssign &VA,
02723                                     ISD::ArgFlagsTy Flags) const {
02724   unsigned LocMemOffset = VA.getLocMemOffset();
02725   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02726   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02727   if (Flags.isByVal())
02728     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02729 
02730   return DAG.getStore(Chain, dl, Arg, PtrOff,
02731                       MachinePointerInfo::getStack(LocMemOffset),
02732                       false, false, 0);
02733 }
02734 
02735 /// Emit a load of the return address if tail call
02736 /// optimization is performed and it is required.
02737 SDValue
02738 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02739                                            SDValue &OutRetAddr, SDValue Chain,
02740                                            bool IsTailCall, bool Is64Bit,
02741                                            int FPDiff, SDLoc dl) const {
02742   // Adjust the Return address stack slot.
02743   EVT VT = getPointerTy();
02744   OutRetAddr = getReturnAddressFrameIndex(DAG);
02745 
02746   // Load the "old" Return address.
02747   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02748                            false, false, false, 0);
02749   return SDValue(OutRetAddr.getNode(), 1);
02750 }
02751 
02752 /// Emit a store of the return address if tail call
02753 /// optimization is performed and it is required (FPDiff!=0).
02754 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
02755                                         SDValue Chain, SDValue RetAddrFrIdx,
02756                                         EVT PtrVT, unsigned SlotSize,
02757                                         int FPDiff, SDLoc dl) {
02758   // Store the return address to the appropriate stack slot.
02759   if (!FPDiff) return Chain;
02760   // Calculate the new stack slot for the return address.
02761   int NewReturnAddrFI =
02762     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02763                                          false);
02764   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02765   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02766                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02767                        false, false, 0);
02768   return Chain;
02769 }
02770 
02771 SDValue
02772 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02773                              SmallVectorImpl<SDValue> &InVals) const {
02774   SelectionDAG &DAG                     = CLI.DAG;
02775   SDLoc &dl                             = CLI.DL;
02776   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02777   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02778   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02779   SDValue Chain                         = CLI.Chain;
02780   SDValue Callee                        = CLI.Callee;
02781   CallingConv::ID CallConv              = CLI.CallConv;
02782   bool &isTailCall                      = CLI.IsTailCall;
02783   bool isVarArg                         = CLI.IsVarArg;
02784 
02785   MachineFunction &MF = DAG.getMachineFunction();
02786   bool Is64Bit        = Subtarget->is64Bit();
02787   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02788   StructReturnType SR = callIsStructReturn(Outs);
02789   bool IsSibcall      = false;
02790   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02791 
02792   if (MF.getTarget().Options.DisableTailCalls)
02793     isTailCall = false;
02794 
02795   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
02796   if (IsMustTail) {
02797     // Force this to be a tail call.  The verifier rules are enough to ensure
02798     // that we can lower this successfully without moving the return address
02799     // around.
02800     isTailCall = true;
02801   } else if (isTailCall) {
02802     // Check if it's really possible to do a tail call.
02803     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02804                     isVarArg, SR != NotStructReturn,
02805                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02806                     Outs, OutVals, Ins, DAG);
02807 
02808     // Sibcalls are automatically detected tailcalls which do not require
02809     // ABI changes.
02810     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02811       IsSibcall = true;
02812 
02813     if (isTailCall)
02814       ++NumTailCalls;
02815   }
02816 
02817   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02818          "Var args not supported with calling convention fastcc, ghc or hipe");
02819 
02820   // Analyze operands of the call, assigning locations to each operand.
02821   SmallVector<CCValAssign, 16> ArgLocs;
02822   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02823 
02824   // Allocate shadow area for Win64
02825   if (IsWin64)
02826     CCInfo.AllocateStack(32, 8);
02827 
02828   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02829 
02830   // Get a count of how many bytes are to be pushed on the stack.
02831   unsigned NumBytes = CCInfo.getNextStackOffset();
02832   if (IsSibcall)
02833     // This is a sibcall. The memory operands are already available in the
02834     // caller's own stack.
02835     NumBytes = 0;
02836   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
02837            IsTailCallConvention(CallConv))
02838     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02839 
02840   int FPDiff = 0;
02841   if (isTailCall && !IsSibcall && !IsMustTail) {
02842     // Lower arguments at fp - stackoffset + fpdiff.
02843     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02844 
02845     FPDiff = NumBytesCallerPushed - NumBytes;
02846 
02847     // Record the delta of movement of the return address stack slot, but
02848     // only if this delta moves the slot further than any previous one did.
02849     if (FPDiff < X86Info->getTCReturnAddrDelta())
02850       X86Info->setTCReturnAddrDelta(FPDiff);
02851   }
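  // For example (illustrative numbers): if the caller's own incoming arguments
  // occupy 8 bytes (and are callee-popped under -tailcallopt) while the tail
  // callee needs 16 bytes of outgoing arguments, FPDiff is 8 - 16 = -8 and the
  // return address slot has to move 8 bytes further down the stack.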
02852 
02853   unsigned NumBytesToPush = NumBytes;
02854   unsigned NumBytesToPop = NumBytes;
02855 
02856   // If we have an inalloca argument, all stack space has already been allocated
02857   // for us and sits right at the top of the stack.  We don't support multiple
02858   // arguments passed in memory when using inalloca.
02859   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02860     NumBytesToPush = 0;
02861     if (!ArgLocs.back().isMemLoc())
02862       report_fatal_error("cannot use inalloca attribute on a register "
02863                          "parameter");
02864     if (ArgLocs.back().getLocMemOffset() != 0)
02865       report_fatal_error("any parameter with the inalloca attribute must be "
02866                          "the only memory argument");
02867   }
02868 
02869   if (!IsSibcall)
02870     Chain = DAG.getCALLSEQ_START(
02871         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02872 
02873   SDValue RetAddrFrIdx;
02874   // Load return address for tail calls.
02875   if (isTailCall && FPDiff)
02876     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02877                                     Is64Bit, FPDiff, dl);
02878 
02879   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02880   SmallVector<SDValue, 8> MemOpChains;
02881   SDValue StackPtr;
02882 
02883   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02884   // of tail call optimization, arguments are handled later.
02885   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
02886   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02887     // Skip inalloca arguments, they have already been written.
02888     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02889     if (Flags.isInAlloca())
02890       continue;
02891 
02892     CCValAssign &VA = ArgLocs[i];
02893     EVT RegVT = VA.getLocVT();
02894     SDValue Arg = OutVals[i];
02895     bool isByVal = Flags.isByVal();
02896 
02897     // Promote the value if needed.
02898     switch (VA.getLocInfo()) {
02899     default: llvm_unreachable("Unknown loc info!");
02900     case CCValAssign::Full: break;
02901     case CCValAssign::SExt:
02902       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02903       break;
02904     case CCValAssign::ZExt:
02905       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02906       break;
02907     case CCValAssign::AExt:
02908       if (RegVT.is128BitVector()) {
02909         // Special case: passing MMX values in XMM registers.
02910         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02911         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02912         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02913       } else
02914         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02915       break;
02916     case CCValAssign::BCvt:
02917       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02918       break;
02919     case CCValAssign::Indirect: {
02920       // Store the argument.
02921       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02922       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02923       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02924                            MachinePointerInfo::getFixedStack(FI),
02925                            false, false, 0);
02926       Arg = SpillSlot;
02927       break;
02928     }
02929     }
02930 
02931     if (VA.isRegLoc()) {
02932       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02933       if (isVarArg && IsWin64) {
02934         // Win64 ABI requires argument XMM reg to be copied to the corresponding
02935         // shadow reg if callee is a varargs function.
02936         unsigned ShadowReg = 0;
02937         switch (VA.getLocReg()) {
02938         case X86::XMM0: ShadowReg = X86::RCX; break;
02939         case X86::XMM1: ShadowReg = X86::RDX; break;
02940         case X86::XMM2: ShadowReg = X86::R8; break;
02941         case X86::XMM3: ShadowReg = X86::R9; break;
02942         }
02943         if (ShadowReg)
02944           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02945       }
02946     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02947       assert(VA.isMemLoc());
02948       if (!StackPtr.getNode())
02949         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02950                                       getPointerTy());
02951       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02952                                              dl, DAG, VA, Flags));
02953     }
02954   }
02955 
02956   if (!MemOpChains.empty())
02957     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
02958 
02959   if (Subtarget->isPICStyleGOT()) {
02960     // ELF / PIC requires the GOT pointer to be in the EBX register before
02961     // function calls made via the PLT.
02962     if (!isTailCall) {
02963       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02964                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02965     } else {
02966       // If we are tail calling and generating PIC/GOT style code, load the
02967       // address of the callee into ECX. The value in ecx is used as target of
02968       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02969       // for tail calls on PIC/GOT architectures. Normally we would just put the
02970       // address of GOT into ebx and then call target@PLT. But for tail calls
02971       // ebx would be restored (since ebx is callee saved) before jumping to the
02972       // target@PLT.
02973 
02974       // Note: The actual moving to ECX is done further down.
02975       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02976       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02977           !G->getGlobal()->hasProtectedVisibility())
02978         Callee = LowerGlobalAddress(Callee, DAG);
02979       else if (isa<ExternalSymbolSDNode>(Callee))
02980         Callee = LowerExternalSymbol(Callee, DAG);
02981     }
02982   }
02983 
02984   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
02985     // From AMD64 ABI document:
02986     // For calls that may call functions that use varargs or stdargs
02987     // (prototype-less calls or calls to functions containing ellipsis (...) in
02988     // the declaration) %al is used as hidden argument to specify the number
02989     // of SSE registers used. The contents of %al do not need to match exactly
02990     // the number of registers, but must be an upper bound on the number of SSE
02991     // registers used and is in the range 0 - 8 inclusive.
02992 
02993     // Count the number of XMM registers allocated.
02994     static const MCPhysReg XMMArgRegs[] = {
02995       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02996       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02997     };
02998     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
02999     assert((Subtarget->hasSSE1() || !NumXMMRegs)
03000            && "SSE registers cannot be used when SSE is disabled");
03001 
03002     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
03003                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
03004   }
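  // For example, a varargs call that passes a single double in XMM0 reaches
  // here with NumXMMRegs == 1, so %al is set to 1; any upper bound up to 8
  // would equally satisfy the ABI.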
03005 
03006   if (isVarArg && IsMustTail) {
03007     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
03008     for (const auto &F : Forwards) {
03009       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
03010       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
03011     }
03012   }
03013 
03014   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
03015   // don't need this because the eligibility check rejects calls that require
03016   // shuffling arguments passed in memory.
03017   if (!IsSibcall && isTailCall) {
03018     // Force all the incoming stack arguments to be loaded from the stack
03019     // before any new outgoing arguments are stored to the stack, because the
03020     // outgoing stack slots may alias the incoming argument stack slots, and
03021     // the alias isn't otherwise explicit. This is slightly more conservative
03022     // than necessary, because it means that each store effectively depends
03023     // on every argument instead of just those arguments it would clobber.
03024     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
03025 
03026     SmallVector<SDValue, 8> MemOpChains2;
03027     SDValue FIN;
03028     int FI = 0;
03029     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03030       CCValAssign &VA = ArgLocs[i];
03031       if (VA.isRegLoc())
03032         continue;
03033       assert(VA.isMemLoc());
03034       SDValue Arg = OutVals[i];
03035       ISD::ArgFlagsTy Flags = Outs[i].Flags;
03036       // Skip inalloca arguments.  They don't require any work.
03037       if (Flags.isInAlloca())
03038         continue;
03039       // Create frame index.
03040       int32_t Offset = VA.getLocMemOffset()+FPDiff;
03041       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
03042       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
03043       FIN = DAG.getFrameIndex(FI, getPointerTy());
03044 
03045       if (Flags.isByVal()) {
03046         // Copy relative to framepointer.
03047         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
03048         if (!StackPtr.getNode())
03049           StackPtr = DAG.getCopyFromReg(Chain, dl,
03050                                         RegInfo->getStackRegister(),
03051                                         getPointerTy());
03052         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
03053 
03054         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
03055                                                          ArgChain,
03056                                                          Flags, DAG, dl));
03057       } else {
03058         // Store relative to framepointer.
03059         MemOpChains2.push_back(
03060           DAG.getStore(ArgChain, dl, Arg, FIN,
03061                        MachinePointerInfo::getFixedStack(FI),
03062                        false, false, 0));
03063       }
03064     }
03065 
03066     if (!MemOpChains2.empty())
03067       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
03068 
03069     // Store the return address to the appropriate stack slot.
03070     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
03071                                      getPointerTy(), RegInfo->getSlotSize(),
03072                                      FPDiff, dl);
03073   }
03074 
03075   // Build a sequence of copy-to-reg nodes chained together with token chain
03076   // and flag operands which copy the outgoing args into registers.
03077   SDValue InFlag;
03078   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
03079     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
03080                              RegsToPass[i].second, InFlag);
03081     InFlag = Chain.getValue(1);
03082   }
03083 
03084   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
03085     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
03086     // In the 64-bit large code model, we have to make all calls
03087     // through a register, since the call instruction's 32-bit
03088     // pc-relative offset may not be large enough to hold the whole
03089     // address.
03090   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
03091     // If the callee is a GlobalAddress node (quite common, every direct call
03092     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
03093     // it.
03094     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
03095 
03096     // We should use an extra load for direct calls to dllimported functions in
03097     // non-JIT mode.
03098     const GlobalValue *GV = G->getGlobal();
03099     if (!GV->hasDLLImportStorageClass()) {
03100       unsigned char OpFlags = 0;
03101       bool ExtraLoad = false;
03102       unsigned WrapperKind = ISD::DELETED_NODE;
03103 
03104       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
03105       // external symbols must go through the PLT in PIC mode.  If the symbol
03106       // has hidden or protected visibility, or if it is static or local, then
03107       // we don't need to use the PLT - we can directly call it.
03108       if (Subtarget->isTargetELF() &&
03109           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
03110           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
03111         OpFlags = X86II::MO_PLT;
03112       } else if (Subtarget->isPICStyleStubAny() &&
03113                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
03114                  (!Subtarget->getTargetTriple().isMacOSX() ||
03115                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03116         // PC-relative references to external symbols should go through $stub,
03117         // unless we're building with the leopard linker or later, which
03118         // automatically synthesizes these stubs.
03119         OpFlags = X86II::MO_DARWIN_STUB;
03120       } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
03121                  cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
03122         // If the function is marked as non-lazy, generate an indirect call
03123         // which loads from the GOT directly. This avoids runtime overhead
03124         // at the cost of eager binding (and one extra byte of encoding).
03125         OpFlags = X86II::MO_GOTPCREL;
03126         WrapperKind = X86ISD::WrapperRIP;
03127         ExtraLoad = true;
03128       }
03129 
03130       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
03131                                           G->getOffset(), OpFlags);
03132 
03133       // Add a wrapper if needed.
03134       if (WrapperKind != ISD::DELETED_NODE)
03135         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
03136       // Add extra indirection if needed.
03137       if (ExtraLoad)
03138         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
03139                              MachinePointerInfo::getGOT(),
03140                              false, false, false, 0);
03141     }
03142   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
03143     unsigned char OpFlags = 0;
03144 
03145     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
03146     // external symbols should go through the PLT.
03147     if (Subtarget->isTargetELF() &&
03148         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
03149       OpFlags = X86II::MO_PLT;
03150     } else if (Subtarget->isPICStyleStubAny() &&
03151                (!Subtarget->getTargetTriple().isMacOSX() ||
03152                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03153       // PC-relative references to external symbols should go through $stub,
03154       // unless we're building with the leopard linker or later, which
03155       // automatically synthesizes these stubs.
03156       OpFlags = X86II::MO_DARWIN_STUB;
03157     }
03158 
03159     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
03160                                          OpFlags);
03161   } else if (Subtarget->isTarget64BitILP32() &&
03162              Callee->getValueType(0) == MVT::i32) {
03163     // Zero-extend the 32-bit Callee address into a 64-bit one, per the x32 ABI.
03164     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
03165   }
03166 
03167   // Returns a chain & a flag for retval copy to use.
03168   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
03169   SmallVector<SDValue, 8> Ops;
03170 
03171   if (!IsSibcall && isTailCall) {
03172     Chain = DAG.getCALLSEQ_END(Chain,
03173                                DAG.getIntPtrConstant(NumBytesToPop, true),
03174                                DAG.getIntPtrConstant(0, true), InFlag, dl);
03175     InFlag = Chain.getValue(1);
03176   }
03177 
03178   Ops.push_back(Chain);
03179   Ops.push_back(Callee);
03180 
03181   if (isTailCall)
03182     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
03183 
03184   // Add argument registers to the end of the list so that they are known live
03185   // into the call.
03186   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
03187     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
03188                                   RegsToPass[i].second.getValueType()));
03189 
03190   // Add a register mask operand representing the call-preserved registers.
03191   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
03192   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
03193   assert(Mask && "Missing call preserved mask for calling convention");
03194   Ops.push_back(DAG.getRegisterMask(Mask));
03195 
03196   if (InFlag.getNode())
03197     Ops.push_back(InFlag);
03198 
03199   if (isTailCall) {
03200     // We used to do:
03201     //// If this is the first return lowered for this function, add the regs
03202     //// to the liveout set for the function.
03203     // This isn't right, although it's probably harmless on x86; liveouts
03204     // should be computed from returns not tail calls.  Consider a void
03205     // function making a tail call to a function returning int.
03206     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03207   }
03208 
03209   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03210   InFlag = Chain.getValue(1);
03211 
03212   // Create the CALLSEQ_END node.
03213   unsigned NumBytesForCalleeToPop;
03214   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03215                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03216     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03217   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03218            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03219            SR == StackStructReturn)
03220     // If this is a call to a struct-return function, the callee
03221     // pops the hidden struct pointer, so we have to push it back.
03222     // This is common for Darwin/X86, Linux & Mingw32 targets.
03223     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03224     NumBytesForCalleeToPop = 4;
03225   else
03226     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03227 
03228   // Returns a flag for retval copy to use.
03229   if (!IsSibcall) {
03230     Chain = DAG.getCALLSEQ_END(Chain,
03231                                DAG.getIntPtrConstant(NumBytesToPop, true),
03232                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03233                                                      true),
03234                                InFlag, dl);
03235     InFlag = Chain.getValue(1);
03236   }
03237 
03238   // Handle result values, copying them out of physregs into vregs that we
03239   // return.
03240   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03241                          Ins, dl, DAG, InVals);
03242 }
03243 
03244 //===----------------------------------------------------------------------===//
03245 //                Fast Calling Convention (tail call) implementation
03246 //===----------------------------------------------------------------------===//
03247 
03248 //  Like stdcall, the callee cleans up the arguments, except that ECX is
03249 //  reserved for storing the tail-called function's address. Only 2 registers are
03250 //  free for argument passing (inreg). Tail call optimization is performed
03251 //  provided:
03252 //                * tailcallopt is enabled
03253 //                * caller/callee are fastcc
03254 //  On X86_64 architecture with GOT-style position independent code only local
03255 //  (within module) calls are supported at the moment.
03256 //  To keep the stack aligned according to the platform ABI, the function
03257 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
03258 //  multiple of the stack alignment (dynamic linkers such as darwin's dyld need this).
03259 //  If the tail-called callee has more arguments than the caller, the caller
03260 //  needs to make sure that there is room to move the RETADDR to. This is
03261 //  achieved by reserving an area the size of the argument delta right after the
03262 //  original RETADDR, but before the saved frame pointer or the spilled registers,
03263 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
03264 //  stack layout:
03265 //    arg1
03266 //    arg2
03267 //    RETADDR
03268 //    [ new RETADDR
03269 //      move area ]
03270 //    (possible EBP)
03271 //    ESI
03272 //    EDI
03273 //    local1 ..
03274 
03275 /// GetAlignedArgumentStackSize - Round the stack size up so that it is, e.g.,
03276 /// 16n + 12 bytes for a 16-byte alignment requirement.
03277 unsigned
03278 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03279                                                SelectionDAG& DAG) const {
03280   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
03281   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
03282   unsigned StackAlignment = TFI.getStackAlignment();
03283   uint64_t AlignMask = StackAlignment - 1;
03284   int64_t Offset = StackSize;
03285   unsigned SlotSize = RegInfo->getSlotSize();
03286   if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
03287     // The low bits are already <= (StackAlignment - SlotSize), e.g. 12, so just
03287     // add the difference.
03288     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03289   } else {
03290     // Mask out the lower bits, then add the stack alignment once plus
03290     // (StackAlignment - SlotSize) bytes.
03291     Offset = ((~AlignMask) & Offset) + StackAlignment +
03292       (StackAlignment-SlotSize);
03293   }
03294   return Offset;
03295 }
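// A minimal standalone sketch of the rounding above, assuming a 16-byte stack
// alignment and a 4-byte slot size (illustrative values, not taken from a real
// subtarget). It is not used by the backend; it only restates the computation:
static unsigned alignArgStackSizeSketch(unsigned StackSize) {
  const unsigned StackAlignment = 16, SlotSize = 4; // assumed constants
  const unsigned AlignMask = StackAlignment - 1;
  unsigned Offset = StackSize;
  if ((Offset & AlignMask) <= (StackAlignment - SlotSize))
    // Low bits already fit below the slot-adjusted boundary: pad up to it.
    Offset += (StackAlignment - SlotSize) - (Offset & AlignMask);
  else
    // Otherwise round down to the alignment, then add one full alignment plus
    // the slot-adjusted remainder.
    Offset = (~AlignMask & Offset) + StackAlignment + (StackAlignment - SlotSize);
  return Offset; // e.g. 24 -> 28 and 13 -> 28, both of the form 16n + 12.
}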
03296 
03297 /// MatchingStackOffset - Return true if the given stack call argument is
03298 /// already available in the same position (relatively) of the caller's
03299 /// incoming argument stack.
03300 static
03301 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03302                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03303                          const X86InstrInfo *TII) {
03304   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03305   int FI = INT_MAX;
03306   if (Arg.getOpcode() == ISD::CopyFromReg) {
03307     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03308     if (!TargetRegisterInfo::isVirtualRegister(VR))
03309       return false;
03310     MachineInstr *Def = MRI->getVRegDef(VR);
03311     if (!Def)
03312       return false;
03313     if (!Flags.isByVal()) {
03314       if (!TII->isLoadFromStackSlot(Def, FI))
03315         return false;
03316     } else {
03317       unsigned Opcode = Def->getOpcode();
03318       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
03319            Opcode == X86::LEA64_32r) &&
03320           Def->getOperand(1).isFI()) {
03321         FI = Def->getOperand(1).getIndex();
03322         Bytes = Flags.getByValSize();
03323       } else
03324         return false;
03325     }
03326   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03327     if (Flags.isByVal())
03328       // ByVal argument is passed in as a pointer but it's now being
03329       // dereferenced. e.g.
03330       // define @foo(%struct.X* %A) {
03331       //   tail call @bar(%struct.X* byval %A)
03332       // }
03333       return false;
03334     SDValue Ptr = Ld->getBasePtr();
03335     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03336     if (!FINode)
03337       return false;
03338     FI = FINode->getIndex();
03339   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03340     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03341     FI = FINode->getIndex();
03342     Bytes = Flags.getByValSize();
03343   } else
03344     return false;
03345 
03346   assert(FI != INT_MAX);
03347   if (!MFI->isFixedObjectIndex(FI))
03348     return false;
03349   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03350 }
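// In other words, an outgoing argument "matches" only when it is simply the
// caller's own incoming argument forwarded from the same fixed stack slot with
// the same size, so a sibcall can reuse the slot in place without any copy.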
03351 
03352 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03353 /// for tail call optimization. Targets which want to do tail call
03354 /// optimization should implement this function.
03355 bool
03356 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03357                                                      CallingConv::ID CalleeCC,
03358                                                      bool isVarArg,
03359                                                      bool isCalleeStructRet,
03360                                                      bool isCallerStructRet,
03361                                                      Type *RetTy,
03362                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03363                                     const SmallVectorImpl<SDValue> &OutVals,
03364                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03365                                                      SelectionDAG &DAG) const {
03366   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03367     return false;
03368 
03369   // If -tailcallopt is specified, make fastcc functions tail-callable.
03370   const MachineFunction &MF = DAG.getMachineFunction();
03371   const Function *CallerF = MF.getFunction();
03372 
03373   // If the function return type is x86_fp80 and the callee return type is not,
03374   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03375   // perform a tailcall optimization here.
03376   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03377     return false;
03378 
03379   CallingConv::ID CallerCC = CallerF->getCallingConv();
03380   bool CCMatch = CallerCC == CalleeCC;
03381   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03382   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03383 
03384   // Win64 functions have extra shadow space for argument homing. Don't do the
03385   // sibcall if the caller and callee have mismatched expectations for this
03386   // space.
03387   if (IsCalleeWin64 != IsCallerWin64)
03388     return false;
03389 
03390   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
03391     if (IsTailCallConvention(CalleeCC) && CCMatch)
03392       return true;
03393     return false;
03394   }
03395 
03396   // Look for obvious safe cases to perform tail call optimization that do not
03397   // require ABI changes. This is what gcc calls sibcall.
03398 
03399   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03400   // emit a special epilogue.
03401   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
03402   if (RegInfo->needsStackRealignment(MF))
03403     return false;
03404 
03405   // Also avoid sibcall optimization if either caller or callee uses struct
03406   // return semantics.
03407   if (isCalleeStructRet || isCallerStructRet)
03408     return false;
03409 
03410   // An stdcall/thiscall caller is expected to clean up its arguments; the
03411   // callee isn't going to do that.
03412   // FIXME: this is more restrictive than needed. We could produce a tailcall
03413   // when the stack adjustment matches. For example, with a thiscall that takes
03414   // only one argument.
03415   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03416                    CallerCC == CallingConv::X86_ThisCall))
03417     return false;
03418 
03419   // Do not sibcall optimize vararg calls unless all arguments are passed via
03420   // registers.
03421   if (isVarArg && !Outs.empty()) {
03422 
03423     // Optimizing for varargs on Win64 is unlikely to be safe without
03424     // additional testing.
03425     if (IsCalleeWin64 || IsCallerWin64)
03426       return false;
03427 
03428     SmallVector<CCValAssign, 16> ArgLocs;
03429     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03430                    *DAG.getContext());
03431 
03432     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03433     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03434       if (!ArgLocs[i].isRegLoc())
03435         return false;
03436   }
03437 
03438   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03439   // stack.  Therefore, if the result is not used by the caller it is not safe
03440   // to optimize this into a sibcall.
03441   bool Unused = false;
03442   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03443     if (!Ins[i].Used) {
03444       Unused = true;
03445       break;
03446     }
03447   }
03448   if (Unused) {
03449     SmallVector<CCValAssign, 16> RVLocs;
03450     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
03451                    *DAG.getContext());
03452     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03453     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03454       CCValAssign &VA = RVLocs[i];
03455       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
03456         return false;
03457     }
03458   }
03459 
03460   // If the calling conventions do not match, then we'd better make sure the
03461   // results are returned in the same way as what the caller expects.
03462   if (!CCMatch) {
03463     SmallVector<CCValAssign, 16> RVLocs1;
03464     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
03465                     *DAG.getContext());
03466     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03467 
03468     SmallVector<CCValAssign, 16> RVLocs2;
03469     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
03470                     *DAG.getContext());
03471     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03472 
03473     if (RVLocs1.size() != RVLocs2.size())
03474       return false;
03475     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03476       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03477         return false;
03478       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03479         return false;
03480       if (RVLocs1[i].isRegLoc()) {
03481         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03482           return false;
03483       } else {
03484         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03485           return false;
03486       }
03487     }
03488   }
03489 
03490   // If the callee takes no arguments then go on to check the results of the
03491   // call.
03492   if (!Outs.empty()) {
03493     // Check if stack adjustment is needed. For now, do not do this if any
03494     // argument is passed on the stack.
03495     SmallVector<CCValAssign, 16> ArgLocs;
03496     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03497                    *DAG.getContext());
03498 
03499     // Allocate shadow area for Win64
03500     if (IsCalleeWin64)
03501       CCInfo.AllocateStack(32, 8);
03502 
03503     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03504     if (CCInfo.getNextStackOffset()) {
03505       MachineFunction &MF = DAG.getMachineFunction();
03506       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03507         return false;
03508 
03509       // Check if the arguments are already laid out in the right way as
03510       // the caller's fixed stack objects.
03511       MachineFrameInfo *MFI = MF.getFrameInfo();
03512       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03513       const X86InstrInfo *TII = Subtarget->getInstrInfo();
03514       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03515         CCValAssign &VA = ArgLocs[i];
03516         SDValue Arg = OutVals[i];
03517         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03518         if (VA.getLocInfo() == CCValAssign::Indirect)
03519           return false;
03520         if (!VA.isRegLoc()) {
03521           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03522                                    MFI, MRI, TII))
03523             return false;
03524         }
03525       }
03526     }
03527 
03528     // If the tailcall address may be in a register, then make sure it's
03529     // possible to register allocate for it. In 32-bit, the call address can
03530     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03531     // callee-saved registers are restored. These happen to be the same
03532     // registers used to pass 'inreg' arguments so watch out for those.
03533     if (!Subtarget->is64Bit() &&
03534         ((!isa<GlobalAddressSDNode>(Callee) &&
03535           !isa<ExternalSymbolSDNode>(Callee)) ||
03536          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
03537       unsigned NumInRegs = 0;
03538       // In PIC we need an extra register to formulate the address computation
03539       // for the callee.
03540       unsigned MaxInRegs =
03541         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03542 
03543       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03544         CCValAssign &VA = ArgLocs[i];
03545         if (!VA.isRegLoc())
03546           continue;
03547         unsigned Reg = VA.getLocReg();
03548         switch (Reg) {
03549         default: break;
03550         case X86::EAX: case X86::EDX: case X86::ECX:
03551           if (++NumInRegs == MaxInRegs)
03552             return false;
03553           break;
03554         }
03555       }
03556     }
03557   }
03558 
03559   return true;
03560 }
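
// A rough, file-independent illustration of the struct-return check above,
// assuming a C caller/callee pair compiled for x86-64 SysV (all names below
// are hypothetical): a call whose scalar result comes back in a register can
// typically become a sibcall (a plain jmp), while a call whose result travels
// through a hidden sret pointer is rejected by the
// isCalleeStructRet/isCallerStructRet test and stays a normal call.
struct SketchBig { long a, b, c, d; };                         // returned via sret
static long sketchScalarCallee(long x) { return 2 * x; }
static SketchBig sketchSretCallee(long x) { return {x, x, x, x}; }
static long sketchTailCallable(long x) { return sketchScalarCallee(x + 1); }
static SketchBig sketchNotTailCallable(long x) { return sketchSretCallee(x + 1); }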
03561 
03562 FastISel *
03563 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03564                                   const TargetLibraryInfo *libInfo) const {
03565   return X86::createFastISel(funcInfo, libInfo);
03566 }
03567 
03568 //===----------------------------------------------------------------------===//
03569 //                           Other Lowering Hooks
03570 //===----------------------------------------------------------------------===//
03571 
03572 static bool MayFoldLoad(SDValue Op) {
03573   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03574 }
03575 
03576 static bool MayFoldIntoStore(SDValue Op) {
03577   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03578 }
03579 
03580 static bool isTargetShuffle(unsigned Opcode) {
03581   switch(Opcode) {
03582   default: return false;
03583   case X86ISD::BLENDI:
03584   case X86ISD::PSHUFB:
03585   case X86ISD::PSHUFD:
03586   case X86ISD::PSHUFHW:
03587   case X86ISD::PSHUFLW:
03588   case X86ISD::SHUFP:
03589   case X86ISD::PALIGNR:
03590   case X86ISD::MOVLHPS:
03591   case X86ISD::MOVLHPD:
03592   case X86ISD::MOVHLPS:
03593   case X86ISD::MOVLPS:
03594   case X86ISD::MOVLPD:
03595   case X86ISD::MOVSHDUP:
03596   case X86ISD::MOVSLDUP:
03597   case X86ISD::MOVDDUP:
03598   case X86ISD::MOVSS:
03599   case X86ISD::MOVSD:
03600   case X86ISD::UNPCKL:
03601   case X86ISD::UNPCKH:
03602   case X86ISD::VPERMILPI:
03603   case X86ISD::VPERM2X128:
03604   case X86ISD::VPERMI:
03605     return true;
03606   }
03607 }
03608 
03609 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03610                                     SDValue V1, unsigned TargetMask,
03611                                     SelectionDAG &DAG) {
03612   switch(Opc) {
03613   default: llvm_unreachable("Unknown x86 shuffle node");
03614   case X86ISD::PSHUFD:
03615   case X86ISD::PSHUFHW:
03616   case X86ISD::PSHUFLW:
03617   case X86ISD::VPERMILPI:
03618   case X86ISD::VPERMI:
03619     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03620   }
03621 }
03622 
03623 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03624                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03625   switch(Opc) {
03626   default: llvm_unreachable("Unknown x86 shuffle node");
03627   case X86ISD::MOVLHPS:
03628   case X86ISD::MOVLHPD:
03629   case X86ISD::MOVHLPS:
03630   case X86ISD::MOVLPS:
03631   case X86ISD::MOVLPD:
03632   case X86ISD::MOVSS:
03633   case X86ISD::MOVSD:
03634   case X86ISD::UNPCKL:
03635   case X86ISD::UNPCKH:
03636     return DAG.getNode(Opc, dl, VT, V1, V2);
03637   }
03638 }
03639 
03640 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03641   MachineFunction &MF = DAG.getMachineFunction();
03642   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
03643   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03644   int ReturnAddrIndex = FuncInfo->getRAIndex();
03645 
03646   if (ReturnAddrIndex == 0) {
03647     // Set up a frame object for the return address.
03648     unsigned SlotSize = RegInfo->getSlotSize();
03649     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03650                                                            -(int64_t)SlotSize,
03651                                                            false);
03652     FuncInfo->setRAIndex(ReturnAddrIndex);
03653   }
03654 
03655   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03656 }
03657 
03658 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03659                                        bool hasSymbolicDisplacement) {
03660   // Offset should fit into 32 bit immediate field.
03661   if (!isInt<32>(Offset))
03662     return false;
03663 
03664   // If we don't have a symbolic displacement - we don't have any extra
03665   // restrictions.
03666   if (!hasSymbolicDisplacement)
03667     return true;
03668 
03669   // FIXME: Some tweaks might be needed for medium code model.
03670   if (M != CodeModel::Small && M != CodeModel::Kernel)
03671     return false;
03672 
03673   // For the small code model we assume the last object ends at least 16MB
03674   // before the 31-bit boundary. We may also accept fairly large negative
03675   // constants, since all objects lie in the positive half of the address space.
03676   if (M == CodeModel::Small && Offset < 16*1024*1024)
03677     return true;
03678 
03679   // For the kernel code model we know that all objects reside in the negative
03680   // half of the 32-bit address space. We must not accept negative offsets, since
03681   // they may fall just out of range, but fairly large positive ones are fine.
03682   if (M == CodeModel::Kernel && Offset >= 0)
03683     return true;
03684 
03685   return false;
03686 }
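
// A standalone sketch of the same decision, with plain bools standing in for
// CodeModel::Model (illustrative only, not part of this file): a 1MB offset
// against a symbol is fine for the small model, while any negative offset is
// refused for the kernel model.
static bool sketchOffsetFitsCodeModel(long long Offset, bool SmallModel,
                                      bool KernelModel, bool HasSymbol) {
  if (Offset < -(1LL << 31) || Offset >= (1LL << 31))
    return false;                         // must fit a signed 32-bit field
  if (!HasSymbol)
    return true;                          // pure constant: no extra limits
  if (SmallModel)
    return Offset < 16 * 1024 * 1024;     // stay 16MB clear of the 2^31 line
  if (KernelModel)
    return Offset >= 0;                   // objects live in the negative half
  return false;                           // other models: be conservative
}
// e.g. sketchOffsetFitsCodeModel(1 << 20, true,  false, true) == true
//      sketchOffsetFitsCodeModel(-8,      false, true,  true) == false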
03687 
03688 /// isCalleePop - Determines whether the callee is required to pop its
03689 /// own arguments. Callee pop is necessary to support tail calls.
03690 bool X86::isCalleePop(CallingConv::ID CallingConv,
03691                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03692   switch (CallingConv) {
03693   default:
03694     return false;
03695   case CallingConv::X86_StdCall:
03696   case CallingConv::X86_FastCall:
03697   case CallingConv::X86_ThisCall:
03698     return !is64Bit;
03699   case CallingConv::Fast:
03700   case CallingConv::GHC:
03701   case CallingConv::HiPE:
03702     if (IsVarArg)
03703       return false;
03704     return TailCallOpt;
03705   }
03706 }
03707 
03708 /// \brief Return true if the condition is an unsigned comparison operation.
03709 static bool isX86CCUnsigned(unsigned X86CC) {
03710   switch (X86CC) {
03711   default: llvm_unreachable("Invalid integer condition!");
03712   case X86::COND_E:     return true;
03713   case X86::COND_G:     return false;
03714   case X86::COND_GE:    return false;
03715   case X86::COND_L:     return false;
03716   case X86::COND_LE:    return false;
03717   case X86::COND_NE:    return true;
03718   case X86::COND_B:     return true;
03719   case X86::COND_A:     return true;
03720   case X86::COND_BE:    return true;
03721   case X86::COND_AE:    return true;
03722   }
03723   llvm_unreachable("covered switch fell through?!");
03724 }
03725 
03726 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
03727 /// specific condition code, returning the condition code and the LHS/RHS of the
03728 /// comparison to make.
03729 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03730                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03731   if (!isFP) {
03732     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03733       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03734         // X > -1   -> X == 0, jump !sign.
03735         RHS = DAG.getConstant(0, RHS.getValueType());
03736         return X86::COND_NS;
03737       }
03738       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03739         // X < 0   -> X == 0, jump on sign.
03740         return X86::COND_S;
03741       }
03742       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03743         // X < 1   -> X <= 0
03744         RHS = DAG.getConstant(0, RHS.getValueType());
03745         return X86::COND_LE;
03746       }
03747     }
03748 
03749     switch (SetCCOpcode) {
03750     default: llvm_unreachable("Invalid integer condition!");
03751     case ISD::SETEQ:  return X86::COND_E;
03752     case ISD::SETGT:  return X86::COND_G;
03753     case ISD::SETGE:  return X86::COND_GE;
03754     case ISD::SETLT:  return X86::COND_L;
03755     case ISD::SETLE:  return X86::COND_LE;
03756     case ISD::SETNE:  return X86::COND_NE;
03757     case ISD::SETULT: return X86::COND_B;
03758     case ISD::SETUGT: return X86::COND_A;
03759     case ISD::SETULE: return X86::COND_BE;
03760     case ISD::SETUGE: return X86::COND_AE;
03761     }
03762   }
03763 
03764   // First determine if it is required or is profitable to flip the operands.
03765 
03766   // If LHS is a foldable load, but RHS is not, flip the condition.
03767   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03768       !ISD::isNON_EXTLoad(RHS.getNode())) {
03769     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03770     std::swap(LHS, RHS);
03771   }
03772 
03773   switch (SetCCOpcode) {
03774   default: break;
03775   case ISD::SETOLT:
03776   case ISD::SETOLE:
03777   case ISD::SETUGT:
03778   case ISD::SETUGE:
03779     std::swap(LHS, RHS);
03780     break;
03781   }
03782 
03783   // On a floating point condition, the flags are set as follows:
03784   // ZF  PF  CF   op
03785   //  0 | 0 | 0 | X > Y
03786   //  0 | 0 | 1 | X < Y
03787   //  1 | 0 | 0 | X == Y
03788   //  1 | 1 | 1 | unordered
03789   switch (SetCCOpcode) {
03790   default: llvm_unreachable("Condcode should be pre-legalized away");
03791   case ISD::SETUEQ:
03792   case ISD::SETEQ:   return X86::COND_E;
03793   case ISD::SETOLT:              // flipped
03794   case ISD::SETOGT:
03795   case ISD::SETGT:   return X86::COND_A;
03796   case ISD::SETOLE:              // flipped
03797   case ISD::SETOGE:
03798   case ISD::SETGE:   return X86::COND_AE;
03799   case ISD::SETUGT:              // flipped
03800   case ISD::SETULT:
03801   case ISD::SETLT:   return X86::COND_B;
03802   case ISD::SETUGE:              // flipped
03803   case ISD::SETULE:
03804   case ISD::SETLE:   return X86::COND_BE;
03805   case ISD::SETONE:
03806   case ISD::SETNE:   return X86::COND_NE;
03807   case ISD::SETUO:   return X86::COND_P;
03808   case ISD::SETO:    return X86::COND_NP;
03809   case ISD::SETOEQ:
03810   case ISD::SETUNE:  return X86::COND_INVALID;
03811   }
03812 }
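
// File-independent sketch of the flag table above: ucomiss/comiss set ZF/PF/CF
// as listed, and an unordered (NaN) input sets all three. COND_B simply tests
// CF, so it would also fire for unordered inputs; that is why SETOLT/SETOLE
// swap their operands and use COND_A/COND_AE, which stay false when CF is set.
struct SketchFPFlags { bool ZF, PF, CF; };
static SketchFPFlags sketchUComiss(float A, float B) {
  if (A != A || B != B) return {true, true, true};    // unordered (NaN operand)
  if (A == B)           return {true, false, false};
  if (A <  B)           return {false, false, true};
  return {false, false, false};                       // A > B
}
// e.g. sketchUComiss(1.0f, 2.0f).CF == true, so COND_B ("below") is taken.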
03813 
03814 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
03815 /// code. The current x86 ISA includes the following FP cmov instructions:
03816 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03817 static bool hasFPCMov(unsigned X86CC) {
03818   switch (X86CC) {
03819   default:
03820     return false;
03821   case X86::COND_B:
03822   case X86::COND_BE:
03823   case X86::COND_E:
03824   case X86::COND_P:
03825   case X86::COND_A:
03826   case X86::COND_AE:
03827   case X86::COND_NE:
03828   case X86::COND_NP:
03829     return true;
03830   }
03831 }
03832 
03833 /// isFPImmLegal - Returns true if the target can instruction select the
03834 /// specified FP immediate natively. If false, the legalizer will
03835 /// materialize the FP immediate as a load from a constant pool.
03836 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03837   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03838     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03839       return true;
03840   }
03841   return false;
03842 }
03843 
03844 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
03845                                               ISD::LoadExtType ExtTy,
03846                                               EVT NewVT) const {
03847   // "ELF Handling for Thread-Local Storage" specifies that the R_X86_64_GOTTPOFF
03848   // relocation must target a movq or addq instruction: don't let the load shrink.
03849   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
03850   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
03851     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
03852       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
03853   return true;
03854 }
03855 
03856 /// \brief Returns true if it is beneficial to convert a load of a constant
03857 /// to just the constant itself.
03858 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03859                                                           Type *Ty) const {
03860   assert(Ty->isIntegerTy());
03861 
03862   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03863   if (BitSize == 0 || BitSize > 64)
03864     return false;
03865   return true;
03866 }
03867 
03868 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
03869                                                 unsigned Index) const {
03870   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
03871     return false;
03872 
03873   return (Index == 0 || Index == ResVT.getVectorNumElements());
03874 }
03875 
03876 bool X86TargetLowering::isCheapToSpeculateCttz() const {
03877   // Speculate cttz only if we can directly use TZCNT.
03878   return Subtarget->hasBMI();
03879 }
03880 
03881 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
03882   // Speculate ctlz only if we can directly use LZCNT.
03883   return Subtarget->hasLZCNT();
03884 }
03885 
03886 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03887 /// the specified range [Low, Hi).
03888 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03889   return (Val < 0) || (Val >= Low && Val < Hi);
03890 }
03891 
03892 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03893 /// specified value.
03894 static bool isUndefOrEqual(int Val, int CmpVal) {
03895   return (Val < 0 || Val == CmpVal);
03896 }
03897 
03898 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03899 /// at position Pos and ending before Pos+Size, is undef or falls within the
03900 /// sequential range [Low, Low+Size).
03901 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03902                                        unsigned Pos, unsigned Size, int Low) {
03903   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03904     if (!isUndefOrEqual(Mask[i], Low))
03905       return false;
03906   return true;
03907 }
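
// Illustrative use of the two helpers above (indices below zero mean "undef"
// in a shuffle mask): a run with undef holes still matches the expected
// sequence, while a repeated element breaks it.
static bool sketchSequentialExamples() {
  int MaskA[] = {4, -1, 6, 7};   // element 1 is undef; run 4,5,6,7 still matches
  int MaskB[] = {4,  6, 6, 7};   // element 1 should have been 5
  return isSequentialOrUndefInRange(MaskA, 0, 4, 4) &&   // true
         !isSequentialOrUndefInRange(MaskB, 0, 4, 4);    // true (B is rejected)
}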
03908 
03909 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
03910 /// the two vector operands have swapped position.
03911 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
03912                                      unsigned NumElems) {
03913   for (unsigned i = 0; i != NumElems; ++i) {
03914     int idx = Mask[i];
03915     if (idx < 0)
03916       continue;
03917     else if (idx < (int)NumElems)
03918       Mask[i] = idx + NumElems;
03919     else
03920       Mask[i] = idx - NumElems;
03921   }
03922 }
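
// Worked example for the helper above (illustrative only): commuting the
// operands of a 4-element unpcklps-style shuffle turns mask 0,4,1,5 into
// 4,0,5,1 -- every index is redirected to the other operand, and undef
// entries (negative values) are left untouched.
static void sketchCommuteExample() {
  SmallVector<int, 4> Mask;
  Mask.push_back(0); Mask.push_back(4); Mask.push_back(1); Mask.push_back(5);
  CommuteVectorShuffleMask(Mask, 4);   // Mask is now 4, 0, 5, 1
}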
03923 
03924 /// isVEXTRACTIndex - Return true if the specified
03925 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
03926 /// suitable for instructions that extract 128- or 256-bit subvectors.
03927 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
03928   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
03929   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
03930     return false;
03931 
03932   // The index should be aligned on a vecWidth-bit boundary.
03933   uint64_t Index =
03934     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
03935 
03936   MVT VT = N->getSimpleValueType(0);
03937   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
03938   bool Result = (Index * ElSize) % vecWidth == 0;
03939 
03940   return Result;
03941 }
03942 
03943 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
03944 /// operand specifies a subvector insert that is suitable for instructions
03945 /// that insert 128- or 256-bit subvectors.
03946 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
03947   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
03948   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
03949     return false;
03950   // The index should be aligned on a vecWidth-bit boundary.
03951   uint64_t Index =
03952     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
03953 
03954   MVT VT = N->getSimpleValueType(0);
03955   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
03956   bool Result = (Index * ElSize) % vecWidth == 0;
03957 
03958   return Result;
03959 }
03960 
03961 bool X86::isVINSERT128Index(SDNode *N) {
03962   return isVINSERTIndex(N, 128);
03963 }
03964 
03965 bool X86::isVINSERT256Index(SDNode *N) {
03966   return isVINSERTIndex(N, 256);
03967 }
03968 
03969 bool X86::isVEXTRACT128Index(SDNode *N) {
03970   return isVEXTRACTIndex(N, 128);
03971 }
03972 
03973 bool X86::isVEXTRACT256Index(SDNode *N) {
03974   return isVEXTRACTIndex(N, 256);
03975 }
03976 
03977 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
03978   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
03979   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
03980     llvm_unreachable("Illegal extract subvector for VEXTRACT");
03981 
03982   uint64_t Index =
03983     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
03984 
03985   MVT VecVT = N->getOperand(0).getSimpleValueType();
03986   MVT ElVT = VecVT.getVectorElementType();
03987 
03988   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
03989   return Index / NumElemsPerChunk;
03990 }
03991 
03992 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
03993   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
03994   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
03995     llvm_unreachable("Illegal insert subvector for VINSERT");
03996 
03997   uint64_t Index =
03998     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
03999 
04000   MVT VecVT = N->getSimpleValueType(0);
04001   MVT ElVT = VecVT.getVectorElementType();
04002 
04003   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04004   return Index / NumElemsPerChunk;
04005 }
04006 
04007 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
04008 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
04009 /// and VEXTRACTI128 instructions.
04010 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
04011   return getExtractVEXTRACTImmediate(N, 128);
04012 }
04013 
04014 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
04015 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
04016 /// and VEXTRACTI64x4 instructions.
04017 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
04018   return getExtractVEXTRACTImmediate(N, 256);
04019 }
04020 
04021 /// getInsertVINSERT128Immediate - Return the appropriate immediate
04022 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
04023 /// and VINSERTI128 instructions.
04024 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
04025   return getInsertVINSERTImmediate(N, 128);
04026 }
04027 
04028 /// getInsertVINSERT256Immediate - Return the appropriate immediate
04029 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
04030 /// and VINSERTI64x4 instructions.
04031 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
04032   return getInsertVINSERTImmediate(N, 256);
04033 }
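
// Worked example of the immediate computation above (illustrative, no DAG
// nodes involved): extracting elements 4..7 of a v8f32 with vextractf128
// means Index = 4 and 32-bit elements, so NumElemsPerChunk = 128 / 32 = 4 and
// the encoded immediate is 4 / 4 = 1, i.e. the upper 128-bit lane.
static unsigned sketchLaneImmediate(unsigned ElemIndex, unsigned ElemBits,
                                    unsigned LaneBits) {
  unsigned ElemsPerLane = LaneBits / ElemBits;   // 4 f32 elements per 128-bit lane
  return ElemIndex / ElemsPerLane;               // sketchLaneImmediate(4, 32, 128) == 1
}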
04034 
04035 /// isZero - Returns true if V is a constant integer zero.
04036 static bool isZero(SDValue V) {
04037   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
04038   return C && C->isNullValue();
04039 }
04040 
04041 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
04042 /// constant +0.0.
04043 bool X86::isZeroNode(SDValue Elt) {
04044   if (isZero(Elt))
04045     return true;
04046   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
04047     return CFP->getValueAPF().isPosZero();
04048   return false;
04049 }
04050 
04051 /// getZeroVector - Returns a vector of specified type with all zero elements.
04052 ///
04053 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
04054                              SelectionDAG &DAG, SDLoc dl) {
04055   assert(VT.isVector() && "Expected a vector type");
04056 
04057   // Always build SSE zero vectors as <4 x i32> bitcasted
04058   // to their dest type. This ensures they get CSE'd.
04059   SDValue Vec;
04060   if (VT.is128BitVector()) {  // SSE
04061     if (Subtarget->hasSSE2()) {  // SSE2
04062       SDValue Cst = DAG.getConstant(0, MVT::i32);
04063       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04064     } else { // SSE1
04065       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
04066       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
04067     }
04068   } else if (VT.is256BitVector()) { // AVX
04069     if (Subtarget->hasInt256()) { // AVX2
04070       SDValue Cst = DAG.getConstant(0, MVT::i32);
04071       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04072       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
04073     } else {
04074       // 256-bit logic and arithmetic instructions in AVX are all
04075       // floating-point; there is no support for integer ops. Emit FP zeroed vectors.
04076       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
04077       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04078       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
04079     }
04080   } else if (VT.is512BitVector()) { // AVX-512
04081       SDValue Cst = DAG.getConstant(0, MVT::i32);
04082       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
04083                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04084       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
04085   } else if (VT.getScalarType() == MVT::i1) {
04086     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
04087     SDValue Cst = DAG.getConstant(0, MVT::i1);
04088     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
04089     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
04090   } else
04091     llvm_unreachable("Unexpected vector type");
04092 
04093   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04094 }
04095 
04096 /// getOnesVector - Returns a vector of specified type with all bits set.
04097 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
04098 /// no AVX2 support, use two <4 x i32> halves inserted into an <8 x i32> appropriately.
04099 /// Then bitcast to their original type, ensuring they get CSE'd.
04100 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
04101                              SDLoc dl) {
04102   assert(VT.isVector() && "Expected a vector type");
04103 
04104   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
04105   SDValue Vec;
04106   if (VT.is256BitVector()) {
04107     if (HasInt256) { // AVX2
04108       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04109       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
04110     } else { // AVX
04111       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04112       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
04113     }
04114   } else if (VT.is128BitVector()) {
04115     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04116   } else
04117     llvm_unreachable("Unexpected vector type");
04118 
04119   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04120 }
04121 
04122 /// getMOVL - Returns a vector_shuffle node for a movs{s|d} or movd
04123 /// operation of the specified width.
04124 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
04125                        SDValue V2) {
04126   unsigned NumElems = VT.getVectorNumElements();
04127   SmallVector<int, 8> Mask;
04128   Mask.push_back(NumElems);
04129   for (unsigned i = 1; i != NumElems; ++i)
04130     Mask.push_back(i);
04131   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04132 }
04133 
04134 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
04135 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04136                           SDValue V2) {
04137   unsigned NumElems = VT.getVectorNumElements();
04138   SmallVector<int, 8> Mask;
04139   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
04140     Mask.push_back(i);
04141     Mask.push_back(i + NumElems);
04142   }
04143   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04144 }
04145 
04146 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
04147 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04148                           SDValue V2) {
04149   unsigned NumElems = VT.getVectorNumElements();
04150   SmallVector<int, 8> Mask;
04151   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
04152     Mask.push_back(i + Half);
04153     Mask.push_back(i + NumElems + Half);
04154   }
04155   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04156 }
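
// For a 4-element type, the three helpers above build the following masks
// (indices of 4 or more select from the second operand); a standalone sketch:
//   getMOVL    -> 4, 1, 2, 3   (take the low element of V2, keep the rest of V1)
//   getUnpackl -> 0, 4, 1, 5   (interleave the low halves)
//   getUnpackh -> 2, 6, 3, 7   (interleave the high halves)
static void sketchUnpackMasks(SmallVectorImpl<int> &UnpackL,
                              SmallVectorImpl<int> &UnpackH, unsigned NumElems) {
  for (unsigned i = 0, Half = NumElems / 2; i != Half; ++i) {
    UnpackL.push_back(i);          UnpackL.push_back(i + NumElems);
    UnpackH.push_back(i + Half);   UnpackH.push_back(i + Half + NumElems);
  }
}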
04157 
04158 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
04159 /// vector with a zero or undef vector.  This produces a shuffle where the low
04160 /// element of V2 is swizzled into the zero/undef vector, landing at element
04161 /// Idx.  This produces a shuffle mask like 4,1,2,3 (Idx=0) or 0,1,2,4 (Idx=3).
04162 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
04163                                            bool IsZero,
04164                                            const X86Subtarget *Subtarget,
04165                                            SelectionDAG &DAG) {
04166   MVT VT = V2.getSimpleValueType();
04167   SDValue V1 = IsZero
04168     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
04169   unsigned NumElems = VT.getVectorNumElements();
04170   SmallVector<int, 16> MaskVec;
04171   for (unsigned i = 0; i != NumElems; ++i)
04172     // If this is the insertion idx, put the low elt of V2 here.
04173     MaskVec.push_back(i == Idx ? NumElems : i);
04174   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
04175 }
04176 
04177 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
04178 /// target-specific opcode. Returns true if the Mask could be calculated. Sets
04179 /// IsUnary to true if the shuffle uses only one source. Note that this will
04180 /// also set IsUnary for shuffles which use a single input multiple times; in
04181 /// those cases it adjusts the mask to contain only indices within that input.
04182 static bool getTargetShuffleMask(SDNode *N, MVT VT,
04183                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
04184   unsigned NumElems = VT.getVectorNumElements();
04185   SDValue ImmN;
04186 
04187   IsUnary = false;
04188   bool IsFakeUnary = false;
04189   switch(N->getOpcode()) {
04190   case X86ISD::BLENDI:
04191     ImmN = N->getOperand(N->getNumOperands()-1);
04192     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04193     break;
04194   case X86ISD::SHUFP:
04195     ImmN = N->getOperand(N->getNumOperands()-1);
04196     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04197     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04198     break;
04199   case X86ISD::UNPCKH:
04200     DecodeUNPCKHMask(VT, Mask);
04201     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04202     break;
04203   case X86ISD::UNPCKL:
04204     DecodeUNPCKLMask(VT, Mask);
04205     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04206     break;
04207   case X86ISD::MOVHLPS:
04208     DecodeMOVHLPSMask(NumElems, Mask);
04209     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04210     break;
04211   case X86ISD::MOVLHPS:
04212     DecodeMOVLHPSMask(NumElems, Mask);
04213     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04214     break;
04215   case X86ISD::PALIGNR:
04216     ImmN = N->getOperand(N->getNumOperands()-1);
04217     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04218     break;
04219   case X86ISD::PSHUFD:
04220   case X86ISD::VPERMILPI:
04221     ImmN = N->getOperand(N->getNumOperands()-1);
04222     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04223     IsUnary = true;
04224     break;
04225   case X86ISD::PSHUFHW:
04226     ImmN = N->getOperand(N->getNumOperands()-1);
04227     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04228     IsUnary = true;
04229     break;
04230   case X86ISD::PSHUFLW:
04231     ImmN = N->getOperand(N->getNumOperands()-1);
04232     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04233     IsUnary = true;
04234     break;
04235   case X86ISD::PSHUFB: {
04236     IsUnary = true;
04237     SDValue MaskNode = N->getOperand(1);
04238     while (MaskNode->getOpcode() == ISD::BITCAST)
04239       MaskNode = MaskNode->getOperand(0);
04240 
04241     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
04242       // If we have a build-vector, then things are easy.
04243       EVT VT = MaskNode.getValueType();
04244       assert(VT.isVector() &&
04245              "Can't produce a non-vector with a build_vector!");
04246       if (!VT.isInteger())
04247         return false;
04248 
04249       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
04250 
04251       SmallVector<uint64_t, 32> RawMask;
04252       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
04253         SDValue Op = MaskNode->getOperand(i);
04254         if (Op->getOpcode() == ISD::UNDEF) {
04255           RawMask.push_back((uint64_t)SM_SentinelUndef);
04256           continue;
04257         }
04258         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
04259         if (!CN)
04260           return false;
04261         APInt MaskElement = CN->getAPIntValue();
04262 
04263         // We now have to decode the element which could be any integer size and
04264         // extract each byte of it.
04265         for (int j = 0; j < NumBytesPerElement; ++j) {
04266           // Note that this is x86 and so always little endian: the low byte is
04267           // the first byte of the mask.
04268           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
04269           MaskElement = MaskElement.lshr(8);
04270         }
04271       }
04272       DecodePSHUFBMask(RawMask, Mask);
04273       break;
04274     }
04275 
04276     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
04277     if (!MaskLoad)
04278       return false;
04279 
04280     SDValue Ptr = MaskLoad->getBasePtr();
04281     if (Ptr->getOpcode() == X86ISD::Wrapper)
04282       Ptr = Ptr->getOperand(0);
04283 
04284     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
04285     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
04286       return false;
04287 
04288     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
04289       DecodePSHUFBMask(C, Mask);
04290       if (Mask.empty())
04291         return false;
04292       break;
04293     }
04294 
04295     return false;
04296   }
04297   case X86ISD::VPERMI:
04298     ImmN = N->getOperand(N->getNumOperands()-1);
04299     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04300     IsUnary = true;
04301     break;
04302   case X86ISD::MOVSS:
04303   case X86ISD::MOVSD:
04304     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
04305     break;
04306   case X86ISD::VPERM2X128:
04307     ImmN = N->getOperand(N->getNumOperands()-1);
04308     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04309     if (Mask.empty()) return false;
04310     break;
04311   case X86ISD::MOVSLDUP:
04312     DecodeMOVSLDUPMask(VT, Mask);
04313     IsUnary = true;
04314     break;
04315   case X86ISD::MOVSHDUP:
04316     DecodeMOVSHDUPMask(VT, Mask);
04317     IsUnary = true;
04318     break;
04319   case X86ISD::MOVDDUP:
04320     DecodeMOVDDUPMask(VT, Mask);
04321     IsUnary = true;
04322     break;
04323   case X86ISD::MOVLHPD:
04324   case X86ISD::MOVLPD:
04325   case X86ISD::MOVLPS:
04326     // Not yet implemented
04327     return false;
04328   default: llvm_unreachable("unknown target shuffle node");
04329   }
04330 
04331   // If we have a fake unary shuffle, the shuffle mask is spread across two
04332   // inputs that are actually the same node. Re-map the mask to always point
04333   // into the first input.
04334   if (IsFakeUnary)
04335     for (int &M : Mask)
04336       if (M >= (int)Mask.size())
04337         M -= Mask.size();
04338 
04339   return true;
04340 }
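
// Standalone sketch of the decoding the immediate-driven cases above rely on:
// for PSHUFD each 2-bit field of the immediate selects one source element, so
// imm 0x1B (binary 00 01 10 11) reverses the four lanes.
static void sketchDecodePshufdImm(unsigned Imm, int MaskOut[4]) {
  for (unsigned i = 0; i != 4; ++i)
    MaskOut[i] = (Imm >> (2 * i)) & 0x3;   // imm 0x1B -> mask 3, 2, 1, 0
}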
04341 
04342 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
04343 /// element of the result of the vector shuffle.
04344 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
04345                                    unsigned Depth) {
04346   if (Depth == 6)
04347     return SDValue();  // Limit search depth.
04348 
04349   SDValue V = SDValue(N, 0);
04350   EVT VT = V.getValueType();
04351   unsigned Opcode = V.getOpcode();
04352 
04353   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
04354   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
04355     int Elt = SV->getMaskElt(Index);
04356 
04357     if (Elt < 0)
04358       return DAG.getUNDEF(VT.getVectorElementType());
04359 
04360     unsigned NumElems = VT.getVectorNumElements();
04361     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
04362                                          : SV->getOperand(1);
04363     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
04364   }
04365 
04366   // Recurse into target specific vector shuffles to find scalars.
04367   if (isTargetShuffle(Opcode)) {
04368     MVT ShufVT = V.getSimpleValueType();
04369     unsigned NumElems = ShufVT.getVectorNumElements();
04370     SmallVector<int, 16> ShuffleMask;
04371     bool IsUnary;
04372 
04373     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
04374       return SDValue();
04375 
04376     int Elt = ShuffleMask[Index];
04377     if (Elt < 0)
04378       return DAG.getUNDEF(ShufVT.getVectorElementType());
04379 
04380     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
04381                                          : N->getOperand(1);
04382     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
04383                                Depth+1);
04384   }
04385 
04386   // Actual nodes that may contain scalar elements
04387   if (Opcode == ISD::BITCAST) {
04388     V = V.getOperand(0);
04389     EVT SrcVT = V.getValueType();
04390     unsigned NumElems = VT.getVectorNumElements();
04391 
04392     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
04393       return SDValue();
04394   }
04395 
04396   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
04397     return (Index == 0) ? V.getOperand(0)
04398                         : DAG.getUNDEF(VT.getVectorElementType());
04399 
04400   if (V.getOpcode() == ISD::BUILD_VECTOR)
04401     return V.getOperand(Index);
04402 
04403   return SDValue();
04404 }
04405 
04406 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
04407 ///
04408 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
04409                                        unsigned NumNonZero, unsigned NumZero,
04410                                        SelectionDAG &DAG,
04411                                        const X86Subtarget* Subtarget,
04412                                        const TargetLowering &TLI) {
04413   if (NumNonZero > 8)
04414     return SDValue();
04415 
04416   SDLoc dl(Op);
04417   SDValue V;
04418   bool First = true;
04419   for (unsigned i = 0; i < 16; ++i) {
04420     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
04421     if (ThisIsNonZero && First) {
04422       if (NumZero)
04423         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
04424       else
04425         V = DAG.getUNDEF(MVT::v8i16);
04426       First = false;
04427     }
04428 
04429     if ((i & 1) != 0) {
04430       SDValue ThisElt, LastElt;
04431       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
04432       if (LastIsNonZero) {
04433         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
04434                               MVT::i16, Op.getOperand(i-1));
04435       }
04436       if (ThisIsNonZero) {
04437         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
04438         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
04439                               ThisElt, DAG.getConstant(8, MVT::i8));
04440         if (LastIsNonZero)
04441           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
04442       } else
04443         ThisElt = LastElt;
04444 
04445       if (ThisElt.getNode())
04446         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
04447                         DAG.getIntPtrConstant(i/2));
04448     }
04449   }
04450 
04451   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
04452 }
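
// File-independent sketch of the byte-pairing arithmetic above: adjacent bytes
// 2*i and 2*i+1 are combined into one 16-bit lane as lo | (hi << 8) (little
// endian, so the even byte becomes the low half), the lane is inserted at
// position i of a v8i16, and the result is finally bitcast back to v16i8.
static unsigned short sketchPackBytePair(unsigned char LoByte,
                                         unsigned char HiByte) {
  return (unsigned short)(LoByte | ((unsigned)HiByte << 8));
}
// e.g. sketchPackBytePair(0x34, 0x12) == 0x1234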
04453 
04454 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
04455 ///
04456 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
04457                                      unsigned NumNonZero, unsigned NumZero,
04458                                      SelectionDAG &DAG,
04459                                      const X86Subtarget* Subtarget,
04460                                      const TargetLowering &TLI) {
04461   if (NumNonZero > 4)
04462     return SDValue();
04463 
04464   SDLoc dl(Op);
04465   SDValue V;
04466   bool First = true;
04467   for (unsigned i = 0; i < 8; ++i) {
04468     bool isNonZero = (NonZeros & (1 << i)) != 0;
04469     if (isNonZero) {
04470       if (First) {
04471         if (NumZero)
04472           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
04473         else
04474           V = DAG.getUNDEF(MVT::v8i16);
04475         First = false;
04476       }
04477       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
04478                       MVT::v8i16, V, Op.getOperand(i),
04479                       DAG.getIntPtrConstant(i));
04480     }
04481   }
04482 
04483   return V;
04484 }
04485 
04486 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
04487 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
04488                                      const X86Subtarget *Subtarget,
04489                                      const TargetLowering &TLI) {
04490   // Find all zeroable elements.
04491   std::bitset<4> Zeroable;
04492   for (int i=0; i < 4; ++i) {
04493     SDValue Elt = Op->getOperand(i);
04494     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
04495   }
04496   assert(Zeroable.size() - Zeroable.count() > 1 &&
04497          "We expect at least two non-zero elements!");
04498 
04499   // We only know how to deal with build_vector nodes where elements are either
04500   // zeroable or extract_vector_elt with constant index.
04501   SDValue FirstNonZero;
04502   unsigned FirstNonZeroIdx;
04503   for (unsigned i=0; i < 4; ++i) {
04504     if (Zeroable[i])
04505       continue;
04506     SDValue Elt = Op->getOperand(i);
04507     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
04508         !isa<ConstantSDNode>(Elt.getOperand(1)))
04509       return SDValue();
04510     // Make sure that this node is extracting from a 128-bit vector.
04511     MVT VT = Elt.getOperand(0).getSimpleValueType();
04512     if (!VT.is128BitVector())
04513       return SDValue();
04514     if (!FirstNonZero.getNode()) {
04515       FirstNonZero = Elt;
04516       FirstNonZeroIdx = i;
04517     }
04518   }
04519 
04520   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
04521   SDValue V1 = FirstNonZero.getOperand(0);
04522   MVT VT = V1.getSimpleValueType();
04523 
04524   // See if this build_vector can be lowered as a blend with zero.
04525   SDValue Elt;
04526   unsigned EltMaskIdx, EltIdx;
04527   int Mask[4];
04528   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
04529     if (Zeroable[EltIdx]) {
04530       // The zero vector will be on the right hand side.
04531       Mask[EltIdx] = EltIdx+4;
04532       continue;
04533     }
04534 
04535     Elt = Op->getOperand(EltIdx);
04536     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
04537     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
04538     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
04539       break;
04540     Mask[EltIdx] = EltIdx;
04541   }
04542 
04543   if (EltIdx == 4) {
04544     // Let the shuffle legalizer deal with blend operations.
04545     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
04546     if (V1.getSimpleValueType() != VT)
04547       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
04548     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
04549   }
04550 
04551   // See if we can lower this build_vector to a INSERTPS.
04552   if (!Subtarget->hasSSE41())
04553     return SDValue();
04554 
04555   SDValue V2 = Elt.getOperand(0);
04556   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
04557     V1 = SDValue();
04558 
04559   bool CanFold = true;
04560   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
04561     if (Zeroable[i])
04562       continue;
04563 
04564     SDValue Current = Op->getOperand(i);
04565     SDValue SrcVector = Current->getOperand(0);
04566     if (!V1.getNode())
04567       V1 = SrcVector;
04568     CanFold = SrcVector == V1 &&
04569       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
04570   }
04571 
04572   if (!CanFold)
04573     return SDValue();
04574 
04575   assert(V1.getNode() && "Expected at least two non-zero elements!");
04576   if (V1.getSimpleValueType() != MVT::v4f32)
04577     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
04578   if (V2.getSimpleValueType() != MVT::v4f32)
04579     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
04580 
04581   // Ok, we can emit an INSERTPS instruction.
04582   unsigned ZMask = Zeroable.to_ulong();
04583 
04584   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
04585   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
04586   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
04587                                DAG.getIntPtrConstant(InsertPSMask));
04588   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
04589 }
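
// Sketch of the insertps immediate assembled above (illustrative): bits [7:6]
// select the source element, bits [5:4] pick the destination slot, and bits
// [3:0] zero out destination elements.
static unsigned sketchInsertPSImm(unsigned SrcElt, unsigned DstElt,
                                  unsigned ZeroMask) {
  return (SrcElt << 6) | (DstElt << 4) | (ZeroMask & 0xF);
}
// e.g. sketchInsertPSImm(2, 1, 0x8) == 0x98: take source element 2, write it
// to destination element 1, and zero destination element 3.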
04590 
04591 /// Return a vector logical shift node.
04592 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
04593                          unsigned NumBits, SelectionDAG &DAG,
04594                          const TargetLowering &TLI, SDLoc dl) {
04595   assert(VT.is128BitVector() && "Unknown type for VShift");
04596   MVT ShVT = MVT::v2i64;
04597   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
04598   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
04599   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
04600   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
04601   SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy);
04602   return DAG.getNode(ISD::BITCAST, dl, VT,
04603                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
04604 }
04605 
04606 static SDValue
04607 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
04608 
04609   // Check if the scalar load can be widened into a vector load, and if
04610   // the address is "base + cst", see whether the cst can be "absorbed" into
04611   // the shuffle mask.
04612   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
04613     SDValue Ptr = LD->getBasePtr();
04614     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
04615       return SDValue();
04616     EVT PVT = LD->getValueType(0);
04617     if (PVT != MVT::i32 && PVT != MVT::f32)
04618       return SDValue();
04619 
04620     int FI = -1;
04621     int64_t Offset = 0;
04622     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
04623       FI = FINode->getIndex();
04624       Offset = 0;
04625     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
04626                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
04627       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
04628       Offset = Ptr.getConstantOperandVal(1);
04629       Ptr = Ptr.getOperand(0);
04630     } else {
04631       return SDValue();
04632     }
04633 
04634     // FIXME: 256-bit vector instructions don't require a strict alignment,
04635     // improve this code to support it better.
04636     unsigned RequiredAlign = VT.getSizeInBits()/8;
04637     SDValue Chain = LD->getChain();
04638     // Make sure the stack object alignment is at least 16 or 32.
04639     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
04640     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
04641       if (MFI->isFixedObjectIndex(FI)) {
04642         // Can't change the alignment. FIXME: It's possible to compute
04643         // the exact stack offset and reference FI + adjust offset instead.
04644         // If someone *really* cares about this, that's the way to implement it.
04645         return SDValue();
04646       } else {
04647         MFI->setObjectAlignment(FI, RequiredAlign);
04648       }
04649     }
04650 
04651     // (Offset % 16 or 32) must be a multiple of 4. The address is then
04652     // Ptr + (Offset & ~(RequiredAlign - 1)).
04653     if (Offset < 0)
04654       return SDValue();
04655     if ((Offset % RequiredAlign) & 3)
04656       return SDValue();
04657     int64_t StartOffset = Offset & ~(RequiredAlign-1);
04658     if (StartOffset)
04659       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
04660                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
04661 
04662     int EltNo = (Offset - StartOffset) >> 2;
04663     unsigned NumElems = VT.getVectorNumElements();
04664 
04665     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
04666     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
04667                              LD->getPointerInfo().getWithOffset(StartOffset),
04668                              false, false, false, 0);
04669 
04670     SmallVector<int, 8> Mask(NumElems, EltNo);
04671 
04672     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
04673   }
04674 
04675   return SDValue();
04676 }
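
// Worked example of the offset-absorbing arithmetic above (illustrative): a
// scalar f32 load at base+20 feeding a v4f32 splat becomes a 16-byte aligned
// vector load at base+16 plus a splat of element (20 - 16) / 4 == 1.
static void sketchAbsorbOffset(long long Offset, unsigned RequiredAlign,
                               long long &StartOffset, int &EltNo) {
  StartOffset = Offset & ~(long long)(RequiredAlign - 1);   // 20 & ~15 == 16
  EltNo = (int)((Offset - StartOffset) >> 2);               // 4-byte elements -> 1
}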
04677 
04678 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
04679 /// elements can be replaced by a single large load which has the same value as
04680 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
04681 ///
04682 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
04683 ///
04684 /// FIXME: we'd also like to handle the case where the last elements are zero
04685 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
04686 /// There's even a handy isZeroNode for that purpose.
04687 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
04688                                         SDLoc &DL, SelectionDAG &DAG,
04689                                         bool isAfterLegalize) {
04690   unsigned NumElems = Elts.size();
04691 
04692   LoadSDNode *LDBase = nullptr;
04693   unsigned LastLoadedElt = -1U;
04694 
04695   // For each element in the initializer, see if we've found a load or an undef.
04696   // If we don't find an initial load element, or later load elements are
04697   // non-consecutive, bail out.
04698   for (unsigned i = 0; i < NumElems; ++i) {
04699     SDValue Elt = Elts[i];
04700     // Look through a bitcast.
04701     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
04702       Elt = Elt.getOperand(0);
04703     if (!Elt.getNode() ||
04704         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
04705       return SDValue();
04706     if (!LDBase) {
04707       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
04708         return SDValue();
04709       LDBase = cast<LoadSDNode>(Elt.getNode());
04710       LastLoadedElt = i;
04711       continue;
04712     }
04713     if (Elt.getOpcode() == ISD::UNDEF)
04714       continue;
04715 
04716     LoadSDNode *LD = cast<LoadSDNode>(Elt);
04717     EVT LdVT = Elt.getValueType();
04718     // Each loaded element must be the correct fractional portion of the
04719     // requested vector load.
04720     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
04721       return SDValue();
04722     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
04723       return SDValue();
04724     LastLoadedElt = i;
04725   }
04726 
04727   // If we have found an entire vector of loads and undefs, then return a large
04728   // load of the entire vector width starting at the base pointer.  If we found
04729   // consecutive loads for the low half, generate a vzext_load node.
04730   if (LastLoadedElt == NumElems - 1) {
04731     assert(LDBase && "Did not find base load for merging consecutive loads");
04732     EVT EltVT = LDBase->getValueType(0);
04733     // Ensure that the input vector size for the merged loads matches the
04734     // cumulative size of the input elements.
04735     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
04736       return SDValue();
04737 
04738     if (isAfterLegalize &&
04739         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
04740       return SDValue();
04741 
04742     SDValue NewLd = SDValue();
04743 
04744     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
04745                         LDBase->getPointerInfo(), LDBase->isVolatile(),
04746                         LDBase->isNonTemporal(), LDBase->isInvariant(),
04747                         LDBase->getAlignment());
04748 
04749     if (LDBase->hasAnyUseOfValue(1)) {
04750       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
04751                                      SDValue(LDBase, 1),
04752                                      SDValue(NewLd.getNode(), 1));
04753       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
04754       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
04755                              SDValue(NewLd.getNode(), 1));
04756     }
04757 
04758     return NewLd;
04759   }
04760 
04761   // TODO: The code below fires only for loading the low v2i32 / v2f32
04762   // of a v4i32 / v4f32. It's probably worth generalizing.
04763   EVT EltVT = VT.getVectorElementType();
04764   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
04765       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
04766     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
04767     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
04768     SDValue ResNode =
04769         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
04770                                 LDBase->getPointerInfo(),
04771                                 LDBase->getAlignment(),
04772                                 false/*isVolatile*/, true/*ReadMem*/,
04773                                 false/*WriteMem*/);
04774 
04775     // Make sure the newly-created LOAD is in the same position as LDBase in
04776     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
04777     // update uses of LDBase's output chain to use the TokenFactor.
04778     if (LDBase->hasAnyUseOfValue(1)) {
04779       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
04780                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
04781       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
04782       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
04783                              SDValue(ResNode.getNode(), 1));
04784     }
04785 
04786     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
04787   }
04788   return SDValue();
04789 }
04790 
04791 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
04792 /// to generate a splat value for the following cases:
04793 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
04794 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
04795 /// a scalar load, or a constant.
04796 /// The VBROADCAST node is returned when a pattern is found,
04797 /// or SDValue() otherwise.
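///
/// For illustration, two of the shapes recognized here (sketches, not an
/// exhaustive list):
///   (v8f32 (build_vector (load %p), (load %p), ..., (load %p)))
///     --> (v8f32 (X86ISD::VBROADCAST (load %p)))
///   (vector_shuffle<0,0,0,0> (scalar_to_vector (load %p)), undef)
///     --> (X86ISD::VBROADCAST (load %p))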
04798 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
04799                                     SelectionDAG &DAG) {
04800   // VBROADCAST requires AVX.
04801   // TODO: Splats could be generated for non-AVX CPUs using SSE
04802   // instructions, but there's less potential gain for only 128-bit vectors.
04803   if (!Subtarget->hasAVX())
04804     return SDValue();
04805 
04806   MVT VT = Op.getSimpleValueType();
04807   SDLoc dl(Op);
04808 
04809   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
04810          "Unsupported vector type for broadcast.");
04811 
04812   SDValue Ld;
04813   bool ConstSplatVal;
04814 
04815   switch (Op.getOpcode()) {
04816     default:
04817       // Unknown pattern found.
04818       return SDValue();
04819 
04820     case ISD::BUILD_VECTOR: {
04821       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
04822       BitVector UndefElements;
04823       SDValue Splat = BVOp->getSplatValue(&UndefElements);
04824 
04825       // We need a splat of a single value to use broadcast, and it doesn't
04826       // make any sense if the value is only in one element of the vector.
04827       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
04828         return SDValue();
04829 
04830       Ld = Splat;
04831       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
04832                        Ld.getOpcode() == ISD::ConstantFP);
04833 
04834       // Make sure that all of the users of a non-constant load are from the
04835       // BUILD_VECTOR node.
04836       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
04837         return SDValue();
04838       break;
04839     }
04840 
04841     case ISD::VECTOR_SHUFFLE: {
04842       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
04843 
04844       // Shuffles must have a splat mask where the first element is
04845       // broadcasted.
04846       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
04847         return SDValue();
04848 
04849       SDValue Sc = Op.getOperand(0);
04850       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
04851           Sc.getOpcode() != ISD::BUILD_VECTOR) {
04852 
04853         if (!Subtarget->hasInt256())
04854           return SDValue();
04855 
04856         // Use the register form of the broadcast instruction available on AVX2.
04857         if (VT.getSizeInBits() >= 256)
04858           Sc = Extract128BitVector(Sc, 0, DAG, dl);
04859         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
04860       }
04861 
04862       Ld = Sc.getOperand(0);
04863       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
04864                        Ld.getOpcode() == ISD::ConstantFP);
04865 
04866       // The scalar_to_vector node and the suspected
04867       // load node must have exactly one user.
04868       // Constants may have multiple users.
04869 
04870       // AVX-512 has a register form of the broadcast instruction.
04871       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
04872         Ld.getValueType().getSizeInBits() >= 32;
04873       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
04874           !hasRegVer))
04875         return SDValue();
04876       break;
04877     }
04878   }
04879 
04880   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
04881   bool IsGE256 = (VT.getSizeInBits() >= 256);
04882 
04883   // When optimizing for size, generate up to 5 extra bytes for a broadcast
04884   // instruction to save 8 or more bytes of constant pool data.
04885   // TODO: If multiple splats are generated to load the same constant,
04886   // it may be detrimental to overall size. There needs to be a way to detect
04887   // that condition to know if this is truly a size win.
04888   const Function *F = DAG.getMachineFunction().getFunction();
04889   bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
04890 
04891   // Handle broadcasting a single constant scalar from the constant pool
04892   // into a vector.
04893   // On Sandybridge (no AVX2), it is still better to load a constant vector
04894   // from the constant pool and not to broadcast it from a scalar.
04895   // But override that restriction when optimizing for size.
04896   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
04897   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
04898     EVT CVT = Ld.getValueType();
04899     assert(!CVT.isVector() && "Must not broadcast a vector type");
04900 
04901     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
04902     // For size optimization, also splat v2f64 and v2i64, and for size opt
04903     // with AVX2, also splat i8 and i16.
04904     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
04905     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
04906         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
04907       const Constant *C = nullptr;
04908       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
04909         C = CI->getConstantIntValue();
04910       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
04911         C = CF->getConstantFPValue();
04912 
04913       assert(C && "Invalid constant type");
04914 
04915       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
04916       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
04917       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
04918       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
04919                        MachinePointerInfo::getConstantPool(),
04920                        false, false, false, Alignment);
04921 
04922       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04923     }
04924   }
04925 
04926   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
04927 
04928   // Handle AVX2 in-register broadcasts.
04929   if (!IsLoad && Subtarget->hasInt256() &&
04930       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
04931     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04932 
04933   // The scalar source must be a normal load.
04934   if (!IsLoad)
04935     return SDValue();
04936 
04937   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
04938       (Subtarget->hasVLX() && ScalarSize == 64))
04939     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04940 
04941   // The integer check is needed for the 64-bit broadcast into a 128-bit
04942   // vector, so that it doesn't match double, since there is no vbroadcastsd xmm.
04943   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
04944     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
04945       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04946   }
04947 
04948   // Unsupported broadcast.
04949   return SDValue();
04950 }
04951 
04952 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
04953 /// underlying vector and index.
04954 ///
04955 /// Modifies \p ExtractedFromVec to the real vector and returns the real
04956 /// index.
04957 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
04958                                          SDValue ExtIdx) {
04959   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
04960   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
04961     return Idx;
04962 
04963   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
04964   // lowered this:
04965   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
04966   // to:
04967   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
04968   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
04969   //                           undef)
04970   //                       Constant<0>)
04971   // In this case the vector is the extract_subvector expression and the index
04972   // is 2, as specified by the shuffle.
04973   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
04974   SDValue ShuffleVec = SVOp->getOperand(0);
04975   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
04976   assert(ShuffleVecVT.getVectorElementType() ==
04977          ExtractedFromVec.getSimpleValueType().getVectorElementType());
04978 
04979   int ShuffleIdx = SVOp->getMaskElt(Idx);
04980   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
04981     ExtractedFromVec = ShuffleVec;
04982     return ShuffleIdx;
04983   }
04984   return Idx;
04985 }
04986 
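/// \brief Rebuild a BUILD_VECTOR that is mostly made of EXTRACT_VECTOR_ELTs.
///
/// If the operands of the build_vector are extracts from at most two source
/// vectors of the same type as the result, plus a small number of other
/// elements, emit a vector_shuffle of the two sources followed by
/// INSERT_VECTOR_ELT nodes for the remaining elements.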
04987 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
04988   MVT VT = Op.getSimpleValueType();
04989 
04990   // Skip if insert_vec_elt is not supported.
04991   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
04992   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
04993     return SDValue();
04994 
04995   SDLoc DL(Op);
04996   unsigned NumElems = Op.getNumOperands();
04997 
04998   SDValue VecIn1;
04999   SDValue VecIn2;
05000   SmallVector<unsigned, 4> InsertIndices;
05001   SmallVector<int, 8> Mask(NumElems, -1);
05002 
05003   for (unsigned i = 0; i != NumElems; ++i) {
05004     unsigned Opc = Op.getOperand(i).getOpcode();
05005 
05006     if (Opc == ISD::UNDEF)
05007       continue;
05008 
05009     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
05010       // Quit if more than 1 element needs inserting.
05011       if (InsertIndices.size() > 1)
05012         return SDValue();
05013 
05014       InsertIndices.push_back(i);
05015       continue;
05016     }
05017 
05018     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
05019     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
05020     // Quit if non-constant index.
05021     if (!isa<ConstantSDNode>(ExtIdx))
05022       return SDValue();
05023     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
05024 
05025     // Quit if extracted from vector of different type.
05026     if (ExtractedFromVec.getValueType() != VT)
05027       return SDValue();
05028 
05029     if (!VecIn1.getNode())
05030       VecIn1 = ExtractedFromVec;
05031     else if (VecIn1 != ExtractedFromVec) {
05032       if (!VecIn2.getNode())
05033         VecIn2 = ExtractedFromVec;
05034       else if (VecIn2 != ExtractedFromVec)
05035         // Quit if there are more than 2 vectors to shuffle.
05036         return SDValue();
05037     }
05038 
05039     if (ExtractedFromVec == VecIn1)
05040       Mask[i] = Idx;
05041     else if (ExtractedFromVec == VecIn2)
05042       Mask[i] = Idx + NumElems;
05043   }
05044 
05045   if (!VecIn1.getNode())
05046     return SDValue();
05047 
05048   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
05049   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
05050   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
05051     unsigned Idx = InsertIndices[i];
05052     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
05053                      DAG.getIntPtrConstant(Idx));
05054   }
05055 
05056   return NV;
05057 }
05058 
05059 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
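//
// For illustration (a sketch): an all-constant v8i1 build_vector such as
//   (build_vector i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0)
// collapses to the immediate 0b00001101, which is materialized as a v16i1
// bitcast of an i16 constant and then shrunk back to v8i1 with an
// EXTRACT_SUBVECTOR.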
05060 SDValue
05061 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
05062 
05063   MVT VT = Op.getSimpleValueType();
05064   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
05065          "Unexpected type in LowerBUILD_VECTORvXi1!");
05066 
05067   SDLoc dl(Op);
05068   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05069     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
05070     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05071     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05072   }
05073 
05074   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
05075     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
05076     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05077     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05078   }
05079 
05080   bool AllConstants = true;
05081   uint64_t Immediate = 0;
05082   int NonConstIdx = -1;
05083   bool IsSplat = true;
05084   unsigned NumNonConsts = 0;
05085   unsigned NumConsts = 0;
05086   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
05087     SDValue In = Op.getOperand(idx);
05088     if (In.getOpcode() == ISD::UNDEF)
05089       continue;
05090     if (!isa<ConstantSDNode>(In)) {
05091       AllConstants = false;
05092       NonConstIdx = idx;
05093       NumNonConsts++;
05094     } else {
05095       NumConsts++;
05096       if (cast<ConstantSDNode>(In)->getZExtValue())
05097         Immediate |= (1ULL << idx);
05098     }
05099     if (In != Op.getOperand(0))
05100       IsSplat = false;
05101   }
05102 
05103   if (AllConstants) {
05104     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
05105       DAG.getConstant(Immediate, MVT::i16));
05106     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
05107                        DAG.getIntPtrConstant(0));
05108   }
05109 
05110   if (NumNonConsts == 1 && NonConstIdx != 0) {
05111     SDValue DstVec;
05112     if (NumConsts) {
05113       SDValue VecAsImm = DAG.getConstant(Immediate,
05114                                          MVT::getIntegerVT(VT.getSizeInBits()));
05115       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
05116     }
05117     else
05118       DstVec = DAG.getUNDEF(VT);
05119     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
05120                        Op.getOperand(NonConstIdx),
05121                        DAG.getIntPtrConstant(NonConstIdx));
05122   }
05123   if (!IsSplat && (NonConstIdx != 0))
05124     llvm_unreachable("Unsupported BUILD_VECTOR operation");
05125   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
05126   SDValue Select;
05127   if (IsSplat)
05128     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
05129                           DAG.getConstant(-1, SelectVT),
05130                           DAG.getConstant(0, SelectVT));
05131   else
05132     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
05133                          DAG.getConstant((Immediate | 1), SelectVT),
05134                          DAG.getConstant(Immediate, SelectVT));
05135   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
05136 }
05137 
05138 /// \brief Return true if \p N implements a horizontal binop and return the
05139 /// operands of the horizontal binop in \p V0 and \p V1.
05140 ///
05141 /// This is a helper function of PerformBUILD_VECTORCombine.
05142 /// This function checks whether the input build_vector \p N implements a
05143 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
05144 /// operation to match.
05145 /// For example, if \p Opcode is equal to ISD::ADD, then this function
05146 /// checks whether \p N implements a horizontal arithmetic add; if instead
05147 /// \p Opcode is equal to ISD::SUB, then it checks for a horizontal
05148 /// arithmetic sub.
05149 ///
05150 /// This function only analyzes elements of \p N whose indices are
05151 /// in range [BaseIdx, LastIdx).
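///
/// For illustration, a v4f32 build_vector of the following shape (a sketch)
/// matches \p Opcode == ISD::FADD with V0 = A and V1 = B:
///   (build_vector (fadd (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///                 (fadd (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
///                 (fadd (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
///                 (fadd (extract_vector_elt B, 2), (extract_vector_elt B, 3)))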
05152 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
05153                               SelectionDAG &DAG,
05154                               unsigned BaseIdx, unsigned LastIdx,
05155                               SDValue &V0, SDValue &V1) {
05156   EVT VT = N->getValueType(0);
05157 
05158   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
05159   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
05160          "Invalid Vector in input!");
05161 
05162   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
05163   bool CanFold = true;
05164   unsigned ExpectedVExtractIdx = BaseIdx;
05165   unsigned NumElts = LastIdx - BaseIdx;
05166   V0 = DAG.getUNDEF(VT);
05167   V1 = DAG.getUNDEF(VT);
05168 
05169   // Check if N implements a horizontal binop.
05170   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
05171     SDValue Op = N->getOperand(i + BaseIdx);
05172 
05173     // Skip UNDEFs.
05174     if (Op->getOpcode() == ISD::UNDEF) {
05175       // Update the expected vector extract index.
05176       if (i * 2 == NumElts)
05177         ExpectedVExtractIdx = BaseIdx;
05178       ExpectedVExtractIdx += 2;
05179       continue;
05180     }
05181 
05182     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
05183 
05184     if (!CanFold)
05185       break;
05186 
05187     SDValue Op0 = Op.getOperand(0);
05188     SDValue Op1 = Op.getOperand(1);
05189 
05190     // Try to match the following pattern:
05191     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
05192     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
05193         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
05194         Op0.getOperand(0) == Op1.getOperand(0) &&
05195         isa<ConstantSDNode>(Op0.getOperand(1)) &&
05196         isa<ConstantSDNode>(Op1.getOperand(1)));
05197     if (!CanFold)
05198       break;
05199 
05200     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
05201     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
05202 
05203     if (i * 2 < NumElts) {
05204       if (V0.getOpcode() == ISD::UNDEF)
05205         V0 = Op0.getOperand(0);
05206     } else {
05207       if (V1.getOpcode() == ISD::UNDEF)
05208         V1 = Op0.getOperand(0);
05209       if (i * 2 == NumElts)
05210         ExpectedVExtractIdx = BaseIdx;
05211     }
05212 
05213     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
05214     if (I0 == ExpectedVExtractIdx)
05215       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
05216     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
05217       // Try to match the following dag sequence:
05218       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
05219       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
05220     } else
05221       CanFold = false;
05222 
05223     ExpectedVExtractIdx += 2;
05224   }
05225 
05226   return CanFold;
05227 }
05228 
05229 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
05230 /// a concat_vector.
05231 ///
05232 /// This is a helper function of PerformBUILD_VECTORCombine.
05233 /// This function expects two 256-bit vectors called V0 and V1.
05234 /// At first, each vector is split into two separate 128-bit vectors.
05235 /// Then, the resulting 128-bit vectors are used to implement two
05236 /// horizontal binary operations.
05237 ///
05238 /// The kind of horizontal binary operation is defined by \p X86Opcode.
05239 ///
05240 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input
05241 /// to the two new horizontal binops.
05242 /// When Mode is set, the first horizontal binop dag node takes as input the
05243 /// lower 128-bit half of V0 and the upper 128-bit half of V0. The second
05244 /// horizontal binop dag node takes as input the lower 128-bit half of V1
05245 /// and the upper 128-bit half of V1.
05246 ///   Example:
05247 ///     HADD V0_LO, V0_HI
05248 ///     HADD V1_LO, V1_HI
05249 ///
05250 /// Otherwise, the first horizontal binop dag node takes as input the lower
05251 /// 128-bit halves of V0 and V1, and the second horizontal binop dag node
05252 /// takes the upper 128-bit halves of V0 and V1.
05253 ///   Example:
05254 ///     HADD V0_LO, V1_LO
05255 ///     HADD V0_HI, V1_HI
05256 ///
05257 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
05258 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
05259 /// the upper 128-bits of the result.
05260 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
05261                                      SDLoc DL, SelectionDAG &DAG,
05262                                      unsigned X86Opcode, bool Mode,
05263                                      bool isUndefLO, bool isUndefHI) {
05264   EVT VT = V0.getValueType();
05265   assert(VT.is256BitVector() && VT == V1.getValueType() &&
05266          "Invalid nodes in input!");
05267 
05268   unsigned NumElts = VT.getVectorNumElements();
05269   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
05270   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
05271   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
05272   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
05273   EVT NewVT = V0_LO.getValueType();
05274 
05275   SDValue LO = DAG.getUNDEF(NewVT);
05276   SDValue HI = DAG.getUNDEF(NewVT);
05277 
05278   if (Mode) {
05279     // Don't emit a horizontal binop if the result is expected to be UNDEF.
05280     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
05281       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
05282     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
05283       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
05284   } else {
05285     // Don't emit a horizontal binop if the result is expected to be UNDEF.
05286     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
05287                        V1_LO->getOpcode() != ISD::UNDEF))
05288       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
05289 
05290     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
05291                        V1_HI->getOpcode() != ISD::UNDEF))
05292       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
05293   }
05294 
05295   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
05296 }
05297 
05298 /// \brief Try to fold a build_vector that performs an 'addsub' into the
05299 /// sequence of 'vadd + vsub + blendi'.
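///
/// For illustration (a sketch): a v4f32 build_vector of the shape
///   (build_vector (fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0)),
///                 (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1)),
///                 (fsub (extract_vector_elt A, 2), (extract_vector_elt B, 2)),
///                 (fadd (extract_vector_elt A, 3), (extract_vector_elt B, 3)))
/// is folded into (X86ISD::ADDSUB A, B).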
05300 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
05301                            const X86Subtarget *Subtarget) {
05302   SDLoc DL(BV);
05303   EVT VT = BV->getValueType(0);
05304   unsigned NumElts = VT.getVectorNumElements();
05305   SDValue InVec0 = DAG.getUNDEF(VT);
05306   SDValue InVec1 = DAG.getUNDEF(VT);
05307 
05308   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
05309           VT == MVT::v2f64) && "build_vector with an invalid type found!");
05310 
05311   // Odd-numbered elements in the input build vector are obtained from
05312   // adding two float elements.
05313   // Even-numbered elements in the input build vector are obtained from
05314   // subtracting two float elements.
05315   unsigned ExpectedOpcode = ISD::FSUB;
05316   unsigned NextExpectedOpcode = ISD::FADD;
05317   bool AddFound = false;
05318   bool SubFound = false;
05319 
05320   for (unsigned i = 0, e = NumElts; i != e; ++i) {
05321     SDValue Op = BV->getOperand(i);
05322 
05323     // Skip 'undef' values.
05324     unsigned Opcode = Op.getOpcode();
05325     if (Opcode == ISD::UNDEF) {
05326       std::swap(ExpectedOpcode, NextExpectedOpcode);
05327       continue;
05328     }
05329 
05330     // Early exit if we found an unexpected opcode.
05331     if (Opcode != ExpectedOpcode)
05332       return SDValue();
05333 
05334     SDValue Op0 = Op.getOperand(0);
05335     SDValue Op1 = Op.getOperand(1);
05336 
05337     // Try to match the following pattern:
05338     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
05339     // Early exit if we cannot match that sequence.
05340     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05341         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05342         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
05343         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
05344         Op0.getOperand(1) != Op1.getOperand(1))
05345       return SDValue();
05346 
05347     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
05348     if (I0 != i)
05349       return SDValue();
05350 
05351     // We found a valid add/sub node. Update the information accordingly.
05352     if (i & 1)
05353       AddFound = true;
05354     else
05355       SubFound = true;
05356 
05357     // Update InVec0 and InVec1.
05358     if (InVec0.getOpcode() == ISD::UNDEF)
05359       InVec0 = Op0.getOperand(0);
05360     if (InVec1.getOpcode() == ISD::UNDEF)
05361       InVec1 = Op1.getOperand(0);
05362 
05363     // Make sure that the input operands to each add/sub node always
05364     // come from the same pair of vectors.
05365     if (InVec0 != Op0.getOperand(0)) {
05366       if (ExpectedOpcode == ISD::FSUB)
05367         return SDValue();
05368 
05369       // FADD is commutable. Try to commute the operands
05370       // and then test again.
05371       std::swap(Op0, Op1);
05372       if (InVec0 != Op0.getOperand(0))
05373         return SDValue();
05374     }
05375 
05376     if (InVec1 != Op1.getOperand(0))
05377       return SDValue();
05378 
05379     // Update the pair of expected opcodes.
05380     std::swap(ExpectedOpcode, NextExpectedOpcode);
05381   }
05382 
05383   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
05384   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
05385       InVec1.getOpcode() != ISD::UNDEF)
05386     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
05387 
05388   return SDValue();
05389 }
05390 
05391 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
05392                                           const X86Subtarget *Subtarget) {
05393   SDLoc DL(N);
05394   EVT VT = N->getValueType(0);
05395   unsigned NumElts = VT.getVectorNumElements();
05396   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
05397   SDValue InVec0, InVec1;
05398 
05399   // Try to match an ADDSUB.
05400   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
05401       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
05402     SDValue Value = matchAddSub(BV, DAG, Subtarget);
05403     if (Value.getNode())
05404       return Value;
05405   }
05406 
05407   // Try to match horizontal ADD/SUB.
05408   unsigned NumUndefsLO = 0;
05409   unsigned NumUndefsHI = 0;
05410   unsigned Half = NumElts/2;
05411 
05412   // Count the number of UNDEF operands in the input build_vector.
05413   for (unsigned i = 0, e = Half; i != e; ++i)
05414     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
05415       NumUndefsLO++;
05416 
05417   for (unsigned i = Half, e = NumElts; i != e; ++i)
05418     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
05419       NumUndefsHI++;
05420 
05421   // Early exit if this is either a build_vector of all UNDEFs, or if all
05422   // the operands but one are UNDEF.
05423   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
05424     return SDValue();
05425 
05426   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
05427     // Try to match an SSE3 float HADD/HSUB.
05428     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
05429       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
05430 
05431     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
05432       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
05433   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
05434     // Try to match an SSSE3 integer HADD/HSUB.
05435     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
05436       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
05437 
05438     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
05439       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
05440   }
05441 
05442   if (!Subtarget->hasAVX())
05443     return SDValue();
05444 
05445   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
05446     // Try to match an AVX horizontal add/sub of packed single/double
05447     // precision floating point values from 256-bit vectors.
05448     SDValue InVec2, InVec3;
05449     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
05450         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
05451         ((InVec0.getOpcode() == ISD::UNDEF ||
05452           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05453         ((InVec1.getOpcode() == ISD::UNDEF ||
05454           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05455       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
05456 
05457     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
05458         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
05459         ((InVec0.getOpcode() == ISD::UNDEF ||
05460           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05461         ((InVec1.getOpcode() == ISD::UNDEF ||
05462           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05463       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
05464   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
05465     // Try to match an AVX2 horizontal add/sub of signed integers.
05466     SDValue InVec2, InVec3;
05467     unsigned X86Opcode;
05468     bool CanFold = true;
05469 
05470     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
05471         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
05472         ((InVec0.getOpcode() == ISD::UNDEF ||
05473           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05474         ((InVec1.getOpcode() == ISD::UNDEF ||
05475           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05476       X86Opcode = X86ISD::HADD;
05477     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
05478         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
05479         ((InVec0.getOpcode() == ISD::UNDEF ||
05480           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05481         ((InVec1.getOpcode() == ISD::UNDEF ||
05482           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05483       X86Opcode = X86ISD::HSUB;
05484     else
05485       CanFold = false;
05486 
05487     if (CanFold) {
05488       // Fold this build_vector into a single horizontal add/sub.
05489       // Do this only if the target has AVX2.
05490       if (Subtarget->hasAVX2())
05491         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
05492 
05493       // Do not try to expand this build_vector into a pair of horizontal
05494       // add/sub if we can emit a pair of scalar add/sub.
05495       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
05496         return SDValue();
05497 
05498       // Convert this build_vector into a pair of horizontal binops followed
05499       // by a concat vector.
05500       bool isUndefLO = NumUndefsLO == Half;
05501       bool isUndefHI = NumUndefsHI == Half;
05502       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
05503                                    isUndefLO, isUndefHI);
05504     }
05505   }
05506 
05507   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
05508        VT == MVT::v16i16) && Subtarget->hasAVX()) {
05509     unsigned X86Opcode;
05510     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
05511       X86Opcode = X86ISD::HADD;
05512     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
05513       X86Opcode = X86ISD::HSUB;
05514     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
05515       X86Opcode = X86ISD::FHADD;
05516     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
05517       X86Opcode = X86ISD::FHSUB;
05518     else
05519       return SDValue();
05520 
05521     // Don't try to expand this build_vector into a pair of horizontal add/sub
05522     // if we can simply emit a pair of scalar add/sub.
05523     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
05524       return SDValue();
05525 
05526     // Convert this build_vector into two horizontal add/sub followed by
05527     // a concat vector.
05528     bool isUndefLO = NumUndefsLO == Half;
05529     bool isUndefHI = NumUndefsHI == Half;
05530     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
05531                                  isUndefLO, isUndefHI);
05532   }
05533 
05534   return SDValue();
05535 }
05536 
05537 SDValue
05538 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
05539   SDLoc dl(Op);
05540 
05541   MVT VT = Op.getSimpleValueType();
05542   MVT ExtVT = VT.getVectorElementType();
05543   unsigned NumElems = Op.getNumOperands();
05544 
05545   // Build_vectors of i1 (predicate) vectors get their own lowering.
05546   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
05547     return LowerBUILD_VECTORvXi1(Op, DAG);
05548 
05549   // Vectors containing all zeros can be matched by pxor and xorps later
05550   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05551     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
05552     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
05553     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
05554       return Op;
05555 
05556     return getZeroVector(VT, Subtarget, DAG, dl);
05557   }
05558 
05559   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
05560   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
05561   // vpcmpeqd on 256-bit vectors.
05562   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
05563     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
05564       return Op;
05565 
05566     if (!VT.is512BitVector())
05567       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
05568   }
05569 
05570   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
05571   if (Broadcast.getNode())
05572     return Broadcast;
05573 
05574   unsigned EVTBits = ExtVT.getSizeInBits();
05575 
05576   unsigned NumZero  = 0;
05577   unsigned NumNonZero = 0;
05578   unsigned NonZeros = 0;
05579   bool IsAllConstants = true;
05580   SmallSet<SDValue, 8> Values;
05581   for (unsigned i = 0; i < NumElems; ++i) {
05582     SDValue Elt = Op.getOperand(i);
05583     if (Elt.getOpcode() == ISD::UNDEF)
05584       continue;
05585     Values.insert(Elt);
05586     if (Elt.getOpcode() != ISD::Constant &&
05587         Elt.getOpcode() != ISD::ConstantFP)
05588       IsAllConstants = false;
05589     if (X86::isZeroNode(Elt))
05590       NumZero++;
05591     else {
05592       NonZeros |= (1 << i);
05593       NumNonZero++;
05594     }
05595   }
05596 
05597   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
05598   if (NumNonZero == 0)
05599     return DAG.getUNDEF(VT);
05600 
05601   // Special case for a single non-zero, non-undef element.
05602   if (NumNonZero == 1) {
05603     unsigned Idx = countTrailingZeros(NonZeros);
05604     SDValue Item = Op.getOperand(Idx);
05605 
05606     // If this is an insertion of an i64 value on x86-32, and if the top bits of
05607     // the value are obviously zero, truncate the value to i32 and do the
05608     // insertion that way.  Only do this if the value is non-constant or if the
05609     // value is a constant being inserted into element 0.  It is cheaper to do
05610     // a constant pool load than it is to do a movd + shuffle.
05611     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
05612         (!IsAllConstants || Idx == 0)) {
05613       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
05614         // Handle SSE only.
05615         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
05616         EVT VecVT = MVT::v4i32;
05617 
05618         // Truncate the value (which may itself be a constant) to i32, and
05619         // convert it to a vector with movd (S2V+shuffle to zero extend).
05620         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
05621         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
05622         return DAG.getNode(
05623             ISD::BITCAST, dl, VT,
05624             getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
05625       }
05626     }
05627 
05628     // If we have a constant or non-constant insertion into the low element of
05629     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
05630     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
05631     // depending on what the source datatype is.
05632     if (Idx == 0) {
05633       if (NumZero == 0)
05634         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05635 
05636       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
05637           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
05638         if (VT.is256BitVector() || VT.is512BitVector()) {
05639           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
05640           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
05641                              Item, DAG.getIntPtrConstant(0));
05642         }
05643         assert(VT.is128BitVector() && "Expected an SSE value type!");
05644         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05645         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
05646         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05647       }
05648 
05649       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
05650         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
05651         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
05652         if (VT.is256BitVector()) {
05653           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
05654           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
05655         } else {
05656           assert(VT.is128BitVector() && "Expected an SSE value type!");
05657           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05658         }
05659         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
05660       }
05661     }
05662 
05663     // Is it a vector logical left shift?
05664     if (NumElems == 2 && Idx == 1 &&
05665         X86::isZeroNode(Op.getOperand(0)) &&
05666         !X86::isZeroNode(Op.getOperand(1))) {
05667       unsigned NumBits = VT.getSizeInBits();
05668       return getVShift(true, VT,
05669                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
05670                                    VT, Op.getOperand(1)),
05671                        NumBits/2, DAG, *this, dl);
05672     }
05673 
05674     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
05675       return SDValue();
05676 
05677     // Otherwise, if this is a vector with i32 or f32 elements, and the element
05678     // is a non-constant being inserted into an element other than the low one,
05679     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
05680     // movd/movss) to move this into the low element, then shuffle it into
05681     // place.
05682     if (EVTBits == 32) {
05683       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05684       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
05685     }
05686   }
05687 
05688   // Splat is obviously ok. Let legalizer expand it to a shuffle.
05689   if (Values.size() == 1) {
05690     if (EVTBits == 32) {
05691       // Instead of a shuffle like this:
05692       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
05693       // Check if it's possible to issue this instead.
05694       // shuffle (vload ptr), undef, <1, 1, 1, 1>
05695       unsigned Idx = countTrailingZeros(NonZeros);
05696       SDValue Item = Op.getOperand(Idx);
05697       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
05698         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
05699     }
05700     return SDValue();
05701   }
05702 
05703   // A vector full of immediates; various special cases are already
05704   // handled, so this is best done with a single constant-pool load.
05705   if (IsAllConstants)
05706     return SDValue();
05707 
05708   // For AVX-length vectors, see if we can use a vector load to get all of the
05709   // elements, otherwise build the individual 128-bit pieces and use
05710   // shuffles to put them in place.
05711   if (VT.is256BitVector() || VT.is512BitVector()) {
05712     SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
05713 
05714     // Check for a build vector of consecutive loads.
05715     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
05716       return LD;
05717 
05718     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
05719 
05720     // Build both the lower and upper subvector.
05721     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
05722                                 makeArrayRef(&V[0], NumElems/2));
05723     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
05724                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
05725 
05726     // Recreate the wider vector with the lower and upper part.
05727     if (VT.is256BitVector())
05728       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
05729     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
05730   }
05731 
05732   // Let legalizer expand 2-wide build_vectors.
05733   if (EVTBits == 64) {
05734     if (NumNonZero == 1) {
05735       // One half is zero or undef.
05736       unsigned Idx = countTrailingZeros(NonZeros);
05737       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
05738                                  Op.getOperand(Idx));
05739       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
05740     }
05741     return SDValue();
05742   }
05743 
05744   // If element VT is < 32 bits, convert it to inserts into a zero vector.
05745   if (EVTBits == 8 && NumElems == 16) {
05746     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
05747                                         Subtarget, *this);
05748     if (V.getNode()) return V;
05749   }
05750 
05751   if (EVTBits == 16 && NumElems == 8) {
05752     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
05753                                       Subtarget, *this);
05754     if (V.getNode()) return V;
05755   }
05756 
05757   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
05758   if (EVTBits == 32 && NumElems == 4) {
05759     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
05760     if (V.getNode())
05761       return V;
05762   }
05763 
05764   // If element VT is == 32 bits, turn it into a number of shuffles.
05765   SmallVector<SDValue, 8> V(NumElems);
05766   if (NumElems == 4 && NumZero > 0) {
05767     for (unsigned i = 0; i < 4; ++i) {
05768       bool isZero = !(NonZeros & (1 << i));
05769       if (isZero)
05770         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
05771       else
05772         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
05773     }
05774 
05775     for (unsigned i = 0; i < 2; ++i) {
05776       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
05777         default: break;
05778         case 0:
05779           V[i] = V[i*2];  // Must be a zero vector.
05780           break;
05781         case 1:
05782           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
05783           break;
05784         case 2:
05785           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
05786           break;
05787         case 3:
05788           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
05789           break;
05790       }
05791     }
05792 
05793     bool Reverse1 = (NonZeros & 0x3) == 2;
05794     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
05795     int MaskVec[] = {
05796       Reverse1 ? 1 : 0,
05797       Reverse1 ? 0 : 1,
05798       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
05799       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
05800     };
05801     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
05802   }
05803 
05804   if (Values.size() > 1 && VT.is128BitVector()) {
05805     // Check for a build vector of consecutive loads.
05806     for (unsigned i = 0; i < NumElems; ++i)
05807       V[i] = Op.getOperand(i);
05808 
05809     // Check for elements which are consecutive loads.
05810     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
05811     if (LD.getNode())
05812       return LD;
05813 
05814     // Check for a build vector from mostly shuffle plus few inserting.
05815     SDValue Sh = buildFromShuffleMostly(Op, DAG);
05816     if (Sh.getNode())
05817       return Sh;
05818 
05819     // For SSE 4.1, use insertps to put the high elements into the low element.
05820     if (Subtarget->hasSSE41()) {
05821       SDValue Result;
05822       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
05823         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
05824       else
05825         Result = DAG.getUNDEF(VT);
05826 
05827       for (unsigned i = 1; i < NumElems; ++i) {
05828         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
05829         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
05830                              Op.getOperand(i), DAG.getIntPtrConstant(i));
05831       }
05832       return Result;
05833     }
05834 
05835     // Otherwise, expand into a number of unpckl*; start by extending each of
05836     // our (non-undef) elements to the full vector width with the element in the
05837     // bottom slot of the vector (which generates no code for SSE).
05838     for (unsigned i = 0; i < NumElems; ++i) {
05839       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
05840         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
05841       else
05842         V[i] = DAG.getUNDEF(VT);
05843     }
05844 
05845     // Next, we iteratively mix elements, e.g. for v4f32:
05846     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
05847     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
05848     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
05849     unsigned EltStride = NumElems >> 1;
05850     while (EltStride != 0) {
05851       for (unsigned i = 0; i < EltStride; ++i) {
05852         // If V[i+EltStride] is undef and this is the first round of mixing,
05853         // then it is safe to just drop this shuffle: V[i] is already in the
05854         // right place, the one element (since it's the first round) being
05855         // inserted as undef can be dropped.  This isn't safe for successive
05856         // rounds because they will permute elements within both vectors.
05857         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
05858             EltStride == NumElems/2)
05859           continue;
05860 
05861         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
05862       }
05863       EltStride >>= 1;
05864     }
05865     return V[0];
05866   }
05867   return SDValue();
05868 }
05869 
05870 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
05871 // to create 256-bit vectors from two other 128-bit ones.
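//
// For illustration (a sketch): concatenating two v4f32 values V1 and V2 into a
// v8f32 result places V1 in the lower 128-bit half and V2 in the upper half,
// which the backend then matches to a vinsertf128.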
05872 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
05873   SDLoc dl(Op);
05874   MVT ResVT = Op.getSimpleValueType();
05875 
05876   assert((ResVT.is256BitVector() ||
05877           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
05878 
05879   SDValue V1 = Op.getOperand(0);
05880   SDValue V2 = Op.getOperand(1);
05881   unsigned NumElems = ResVT.getVectorNumElements();
05882   if (ResVT.is256BitVector())
05883     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
05884 
05885   if (Op.getNumOperands() == 4) {
05886     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
05887                                 ResVT.getVectorNumElements()/2);
05888     SDValue V3 = Op.getOperand(2);
05889     SDValue V4 = Op.getOperand(3);
05890     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
05891       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
05892   }
05893   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
05894 }
05895 
05896 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
05897   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
05898   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
05899          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
05900           Op.getNumOperands() == 4)));
05901 
05902   // AVX can use the vinsertf128 instruction to create 256-bit vectors
05903   // from two other 128-bit ones.
05904 
05905   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
05906   return LowerAVXCONCAT_VECTORS(Op, DAG);
05907 }
05908 
05909 
05910 //===----------------------------------------------------------------------===//
05911 // Vector shuffle lowering
05912 //
05913 // This is an experimental code path for lowering vector shuffles on x86. It is
05914 // designed to handle arbitrary vector shuffles and blends, gracefully
05915 // degrading performance as necessary. It works hard to recognize idiomatic
05916 // shuffles and lower them to optimal instruction patterns while staying within
05917 // a framework that allows reasonably efficient handling of all vector shuffle
05918 // patterns.
05919 //===----------------------------------------------------------------------===//
05920 
05921 /// \brief Tiny helper function to identify a no-op mask.
05922 ///
05923 /// This is a somewhat boring predicate function. It checks whether the mask
05924 /// array input, which is assumed to be a single-input shuffle mask of the kind
05925 /// used by the X86 shuffle instructions (not a fully general
05926 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
05927 /// in-place shuffle are 'no-op's.
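///
/// For example, the masks <0, 1, 2, 3> and <-1, 1, -1, 3> are both no-ops,
/// while <1, 0, 2, 3> is not.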
05928 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
05929   for (int i = 0, Size = Mask.size(); i < Size; ++i)
05930     if (Mask[i] != -1 && Mask[i] != i)
05931       return false;
05932   return true;
05933 }
05934 
05935 /// \brief Helper function to classify a mask as a single-input mask.
05936 ///
05937 /// This isn't a generic single-input test because in the vector shuffle
05938 /// lowering we canonicalize single inputs to be the first input operand. This
05939 /// means we can more quickly test for a single input by only checking whether
05940 /// an input from the second operand exists. We also assume that the size of
05941 /// the mask corresponds to the size of the input vectors, which isn't true in
05942 /// the fully general case.
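///
/// For example, with 4-element vectors, the mask <0, -1, 2, 1> is single-input,
/// while <0, 5, 2, 1> pulls element 5 from the second operand and is not.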
05943 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
05944   for (int M : Mask)
05945     if (M >= (int)Mask.size())
05946       return false;
05947   return true;
05948 }
05949 
05950 /// \brief Test whether there are elements crossing 128-bit lanes in this
05951 /// shuffle mask.
05952 ///
05953 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
05954 /// and we routinely test for these.
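///
/// For example, with v8f32 (four elements per 128-bit lane), the mask
/// <4, 5, 6, 7, 0, 1, 2, 3> crosses lanes, while <1, 0, 3, 2, 5, 4, 7, 6>
/// stays within each lane.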
05955 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
05956   int LaneSize = 128 / VT.getScalarSizeInBits();
05957   int Size = Mask.size();
05958   for (int i = 0; i < Size; ++i)
05959     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
05960       return true;
05961   return false;
05962 }
05963 
05964 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
05965 ///
05966 /// This checks a shuffle mask to see if it is performing the same
05967 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
05968 /// that it is also not lane-crossing. It may however involve a blend from the
05969 /// same lane of a second vector.
05970 ///
05971 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
05972 /// non-trivial to compute in the face of undef lanes. The representation is
05973 /// *not* suitable for use with existing 128-bit shuffles as it will contain
05974 /// entries from both V1 and V2 inputs to the wider mask.
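///
/// For example, with v8f32 the mask <0, 0, 2, 2, 4, 4, 6, 6> repeats the
/// per-lane pattern <0, 0, 2, 2> in both 128-bit lanes, so \p RepeatedMask is
/// populated with <0, 0, 2, 2>.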
05975 static bool
05976 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
05977                                 SmallVectorImpl<int> &RepeatedMask) {
05978   int LaneSize = 128 / VT.getScalarSizeInBits();
05979   RepeatedMask.resize(LaneSize, -1);
05980   int Size = Mask.size();
05981   for (int i = 0; i < Size; ++i) {
05982     if (Mask[i] < 0)
05983       continue;
05984     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
05985       // This entry crosses lanes, so there is no way to model this shuffle.
05986       return false;
05987 
05988     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
05989     if (RepeatedMask[i % LaneSize] == -1)
05990       // This is the first non-undef entry in this slot of a 128-bit lane.
05991       RepeatedMask[i % LaneSize] =
05992           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
05993     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
05994       // Found a mismatch with the repeated mask.
05995       return false;
05996   }
05997   return true;
05998 }
05999 
06000 /// \brief Checks whether a shuffle mask is equivalent to an explicit expected
06001 /// mask.
06002 ///
06003 /// This is a fast way to test a shuffle mask against a fixed pattern:
06004 ///
06005 ///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
06006 ///
06007 /// It returns true if the mask is exactly as wide as \p ExpectedMask, and each
06008 /// element of the mask is either -1 (signifying undef) or the value given in
06009 /// \p ExpectedMask.
06010 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
06011                                 ArrayRef<int> ExpectedMask) {
06012   if (Mask.size() != ExpectedMask.size())
06013     return false;
06014 
06015   int Size = Mask.size();
06016 
06017   // If the values are build vectors, we can look through them to find
06018   // equivalent inputs that make the shuffles equivalent.
06019   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
06020   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
06021 
06022   for (int i = 0; i < Size; ++i)
06023     if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
06024       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
06025       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
06026       if (!MaskBV || !ExpectedBV ||
06027           MaskBV->getOperand(Mask[i] % Size) !=
06028               ExpectedBV->getOperand(ExpectedMask[i] % Size))
06029         return false;
06030     }
06031 
06032   return true;
06033 }
06034 
06035 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
06036 ///
06037 /// This helper function produces an 8-bit shuffle immediate corresponding to
06038 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
06039 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
06040 /// example.
06041 ///
06042 /// NB: We rely heavily on "undef" masks preserving the input lane.
06043 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
06044                                           SelectionDAG &DAG) {
06045   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
06046   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
06047   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
06048   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
06049   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
06050 
06051   unsigned Imm = 0;
06052   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
06053   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
06054   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
06055   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
06056   return DAG.getConstant(Imm, MVT::i8);
06057 }
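
// Worked example (editorial illustration, not from the original source): for
// Mask = <3, 1, 2, 0> the immediate is built as
// 3 | (1 << 2) | (2 << 4) | (0 << 6) = 0x27, the same encoding PSHUFD and
// SHUFPS expect. An undef slot keeps its own lane index, so <3, -1, 2, 0>
// also yields 0x27.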
06058 
06059 /// \brief Try to emit a blend instruction for a shuffle using bit math.
06060 ///
06061 /// This is used as a fallback approach when first class blend instructions are
06062 /// unavailable. Currently it is only suitable for integer vectors, but could
06063 /// be generalized for floating point vectors if desirable.
06064 static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
06065                                             SDValue V2, ArrayRef<int> Mask,
06066                                             SelectionDAG &DAG) {
06067   assert(VT.isInteger() && "Only supports integer vector types!");
06068   MVT EltVT = VT.getScalarType();
06069   int NumEltBits = EltVT.getSizeInBits();
06070   SDValue Zero = DAG.getConstant(0, EltVT);
06071   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), EltVT);
06072   SmallVector<SDValue, 16> MaskOps;
06073   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06074     if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
06075       return SDValue(); // Shuffled input!
06076     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
06077   }
06078 
06079   SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
06080   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
06081   // Bitcast the mask and V2 to 64-bit elements for the ANDNP node, then back.
06082   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
06083   V2 = DAG.getNode(ISD::BITCAST, DL, VT,
06084                    DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
06085                                DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask),
06086                                DAG.getNode(ISD::BITCAST, DL, MaskVT, V2)));
06087   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
06088 }
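
// Illustrative sketch (not from the original source; the helper name and
// mask are hypothetical). The v4i32 mask <0, 5, 2, 7> qualifies because each
// element is either i (from V1) or i + 4 (from V2); the lowering becomes
// (V1 & <-1, 0, -1, 0>) | (V2 & ~<-1, 0, -1, 0>).
static SDValue exampleBitBlendLowV4I32(SDLoc DL, SDValue V1, SDValue V2,
                                       SelectionDAG &DAG) {
  int Mask[] = {0, 5, 2, 7};
  return lowerVectorShuffleAsBitBlend(DL, MVT::v4i32, V1, V2, Mask, DAG);
}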
06089 
06090 /// \brief Try to emit a blend instruction for a shuffle.
06091 ///
06092 /// This doesn't do any checks for the availability of instructions for blending
06093 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
06094 /// be matched in the backend with the type given. What it does check for is
06095 /// that the shuffle mask is in fact a blend.
06096 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
06097                                          SDValue V2, ArrayRef<int> Mask,
06098                                          const X86Subtarget *Subtarget,
06099                                          SelectionDAG &DAG) {
06100   unsigned BlendMask = 0;
06101   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06102     if (Mask[i] >= Size) {
06103       if (Mask[i] != i + Size)
06104         return SDValue(); // Shuffled V2 input!
06105       BlendMask |= 1u << i;
06106       continue;
06107     }
06108     if (Mask[i] >= 0 && Mask[i] != i)
06109       return SDValue(); // Shuffled V1 input!
06110   }
06111   switch (VT.SimpleTy) {
06112   case MVT::v2f64:
06113   case MVT::v4f32:
06114   case MVT::v4f64:
06115   case MVT::v8f32:
06116     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
06117                        DAG.getConstant(BlendMask, MVT::i8));
06118 
06119   case MVT::v4i64:
06120   case MVT::v8i32:
06121     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
06122     // FALLTHROUGH
06123   case MVT::v2i64:
06124   case MVT::v4i32:
06125     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
06126     // that instruction.
06127     if (Subtarget->hasAVX2()) {
06128       // Scale the blend by the number of 32-bit dwords per element.
06129       int Scale = VT.getScalarSizeInBits() / 32;
06130       BlendMask = 0;
06131       for (int i = 0, Size = Mask.size(); i < Size; ++i)
06132         if (Mask[i] >= Size)
06133           for (int j = 0; j < Scale; ++j)
06134             BlendMask |= 1u << (i * Scale + j);
06135 
06136       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
06137       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
06138       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
06139       return DAG.getNode(ISD::BITCAST, DL, VT,
06140                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
06141                                      DAG.getConstant(BlendMask, MVT::i8)));
06142     }
06143     // FALLTHROUGH
06144   case MVT::v8i16: {
06145     // For integer shuffles we need to expand the mask and cast the inputs to
06146     // v8i16s prior to blending.
06147     int Scale = 8 / VT.getVectorNumElements();
06148     BlendMask = 0;
06149     for (int i = 0, Size = Mask.size(); i < Size; ++i)
06150       if (Mask[i] >= Size)
06151         for (int j = 0; j < Scale; ++j)
06152           BlendMask |= 1u << (i * Scale + j);
06153 
06154     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
06155     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
06156     return DAG.getNode(ISD::BITCAST, DL, VT,
06157                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
06158                                    DAG.getConstant(BlendMask, MVT::i8)));
06159   }
06160 
06161   case MVT::v16i16: {
06162     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
06163     SmallVector<int, 8> RepeatedMask;
06164     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
06165       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
06166       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
06167       BlendMask = 0;
06168       for (int i = 0; i < 8; ++i)
06169         if (RepeatedMask[i] >= 16)
06170           BlendMask |= 1u << i;
06171       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
06172                          DAG.getConstant(BlendMask, MVT::i8));
06173     }
06174   }
06175     // FALLTHROUGH
06176   case MVT::v16i8:
06177   case MVT::v32i8: {
06178     assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
06179            "256-bit byte-blends require AVX2 support!");
06180 
06181     // Scale the blend by the number of bytes per element.
06182     int Scale = VT.getScalarSizeInBits() / 8;
06183 
06184     // This form of blend is always done on bytes. Compute the byte vector
06185     // type.
06186     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
06187 
06188     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
06189     // mix of LLVM's code generator and the x86 backend. We tell the code
06190     // generator that boolean values in the elements of an x86 vector register
06191     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
06192     // mapping a select to operand #1, and 'false' mapping to operand #2. The
06193     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
06194     // of the element (the remaining are ignored) and 0 in that high bit would
06195     // mean operand #1 while 1 in the high bit would mean operand #2. So while
06196     // the LLVM model for boolean values in vector elements gets the relevant
06197     // bit set, it is set backwards and over constrained relative to x86's
06198     // actual model.
06199     SmallVector<SDValue, 32> VSELECTMask;
06200     for (int i = 0, Size = Mask.size(); i < Size; ++i)
06201       for (int j = 0; j < Scale; ++j)
06202         VSELECTMask.push_back(
06203             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
06204                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
06205 
06206     V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
06207     V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
06208     return DAG.getNode(
06209         ISD::BITCAST, DL, VT,
06210         DAG.getNode(ISD::VSELECT, DL, BlendVT,
06211                     DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
06212                     V1, V2));
06213   }
06214 
06215   default:
06216     llvm_unreachable("Not a supported integer vector type!");
06217   }
06218 }
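
// Worked example (editorial illustration, not from the original source): for
// a v8i16 shuffle with Mask = <0, 9, 2, 11, 4, 13, 6, 15>, every element is
// either i or i + 8, so this is a blend with BlendMask = 0xAA (bits 1, 3, 5
// and 7 set) and a single BLENDI (PBLENDW) is emitted with that immediate.
// A mask element selecting anything other than position i of one of the two
// inputs makes the routine return SDValue() instead.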
06219 
06220 /// \brief Try to lower as a blend of elements from two inputs followed by
06221 /// a single-input permutation.
06222 ///
06223 /// This matches the pattern where we can blend elements from two inputs and
06224 /// then reduce the shuffle to a single-input permutation.
06225 static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
06226                                                    SDValue V2,
06227                                                    ArrayRef<int> Mask,
06228                                                    SelectionDAG &DAG) {
06229   // We build up the blend mask while checking whether a blend is a viable way
06230   // to reduce the shuffle.
06231   SmallVector<int, 32> BlendMask(Mask.size(), -1);
06232   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
06233 
06234   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06235     if (Mask[i] < 0)
06236       continue;
06237 
06238     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
06239 
06240     if (BlendMask[Mask[i] % Size] == -1)
06241       BlendMask[Mask[i] % Size] = Mask[i];
06242     else if (BlendMask[Mask[i] % Size] != Mask[i])
06243       return SDValue(); // Can't blend in the needed input!
06244 
06245     PermuteMask[i] = Mask[i] % Size;
06246   }
06247 
06248   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
06249   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
06250 }
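
// Worked example (editorial illustration, not from the original source): for
// a v4i32 shuffle with Mask = <6, 0, 3, 5>, every requested element can first
// be blended into its "home" slot, giving BlendMask = <0, 5, 6, 3>, and the
// blended result is then reordered with PermuteMask = <2, 0, 3, 1>. If two
// mask elements competed for the same blend slot (say the mask requested both
// 1 and 5), the routine would return SDValue() instead.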
06251 
06252 /// \brief Generic routine to decompose a shuffle and blend into independent
06253 /// blends and permutes.
06254 ///
06255 /// This matches the extremely common pattern for handling combined
06256 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
06257 /// operations. It will try to pick the best arrangement of shuffles and
06258 /// blends.
06259 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
06260                                                           SDValue V1,
06261                                                           SDValue V2,
06262                                                           ArrayRef<int> Mask,
06263                                                           SelectionDAG &DAG) {
06264   // Shuffle the input elements into the desired positions in V1 and V2 and
06265   // blend them together.
06266   SmallVector<int, 32> V1Mask(Mask.size(), -1);
06267   SmallVector<int, 32> V2Mask(Mask.size(), -1);
06268   SmallVector<int, 32> BlendMask(Mask.size(), -1);
06269   for (int i = 0, Size = Mask.size(); i < Size; ++i)
06270     if (Mask[i] >= 0 && Mask[i] < Size) {
06271       V1Mask[i] = Mask[i];
06272       BlendMask[i] = i;
06273     } else if (Mask[i] >= Size) {
06274       V2Mask[i] = Mask[i] - Size;
06275       BlendMask[i] = i + Size;
06276     }
06277 
06278   // Try to lower with the simpler initial blend strategy unless one of the
06279   // input shuffles would be a no-op. We prefer to shuffle inputs as the
06280   // shuffle may be able to fold with a load or other benefit. However, when
06281   // we would have to do twice as many shuffles to achieve this, blending
06282   // first is the better strategy.
06283   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
06284     if (SDValue BlendPerm =
06285             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
06286       return BlendPerm;
06287 
06288   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
06289   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
06290   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
06291 }
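
// Worked example (editorial illustration, not from the original source): for
// a v4i32 shuffle with Mask = <2, 5, 1, 7> the decomposition produces
// V1Mask = <2, -1, 1, -1>, V2Mask = <-1, 1, -1, 3> and BlendMask =
// <0, 5, 2, 7>: each input is shuffled into place on its own and the two
// results are then blended element-wise.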
06292 
06293 /// \brief Try to lower a vector shuffle as a byte rotation.
06294 ///
06295 /// SSSE3 provides a generic PALIGNR instruction that performs an arbitrary
06296 /// byte rotation of the concatenation of two vectors; pre-SSSE3 targets can
06297 /// use a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
06298 /// try to generically lower a vector shuffle through such a pattern. It
06299 /// does not check for the profitability of lowering either as PALIGNR or
06300 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
06301 /// This matches shuffle vectors that look like:
06302 ///
06303 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
06304 ///
06305 /// Essentially it concatenates V1 and V2, shifts right by some number of
06306 /// elements, and takes the low elements as the result. Note that while this is
06307 /// specified as a *right shift* because x86 is little-endian, it is a *left
06308 /// rotate* of the vector lanes.
06309 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
06310                                               SDValue V2,
06311                                               ArrayRef<int> Mask,
06312                                               const X86Subtarget *Subtarget,
06313                                               SelectionDAG &DAG) {
06314   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
06315 
06316   int NumElts = Mask.size();
06317   int NumLanes = VT.getSizeInBits() / 128;
06318   int NumLaneElts = NumElts / NumLanes;
06319 
06320   // We need to detect various ways of spelling a rotation:
06321   //   [11, 12, 13, 14, 15,  0,  1,  2]
06322   //   [-1, 12, 13, 14, -1, -1,  1, -1]
06323   //   [-1, -1, -1, -1, -1, -1,  1,  2]
06324   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
06325   //   [-1,  4,  5,  6, -1, -1,  9, -1]
06326   //   [-1,  4,  5,  6, -1, -1, -1, -1]
06327   int Rotation = 0;
06328   SDValue Lo, Hi;
06329   for (int l = 0; l < NumElts; l += NumLaneElts) {
06330     for (int i = 0; i < NumLaneElts; ++i) {
06331       if (Mask[l + i] == -1)
06332         continue;
06333       assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
06334 
06335       // Get the mod-Size index and lane correct it.
06336       int LaneIdx = (Mask[l + i] % NumElts) - l;
06337       // Make sure it was in this lane.
06338       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
06339         return SDValue();
06340 
06341       // Determine where a rotated vector would have started.
06342       int StartIdx = i - LaneIdx;
06343       if (StartIdx == 0)
06344         // The identity rotation isn't interesting, stop.
06345         return SDValue();
06346 
06347       // If we found the tail of a vector, the rotation is the number of
06348       // elements missing from the front. If we found the head of a vector,
06349       // the rotation is how many of its leading elements remain in the lane.
06350       int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
06351 
06352       if (Rotation == 0)
06353         Rotation = CandidateRotation;
06354       else if (Rotation != CandidateRotation)
06355         // The rotations don't match, so we can't match this mask.
06356         return SDValue();
06357 
06358       // Compute which value this mask is pointing at.
06359       SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
06360 
06361       // Compute which of the two target values this index should be assigned
06362       // to. This reflects whether the high elements are remaining or the low
06363       // elements are remaining.
06364       SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
06365 
06366       // Either set up this value if we've not encountered it before, or check
06367       // that it remains consistent.
06368       if (!TargetV)
06369         TargetV = MaskV;
06370       else if (TargetV != MaskV)
06371         // This may be a rotation, but it pulls from the inputs in some
06372         // unsupported interleaving.
06373         return SDValue();
06374     }
06375   }
06376 
06377   // Check that we successfully analyzed the mask, and normalize the results.
06378   assert(Rotation != 0 && "Failed to locate a viable rotation!");
06379   assert((Lo || Hi) && "Failed to find a rotated input vector!");
06380   if (!Lo)
06381     Lo = Hi;
06382   else if (!Hi)
06383     Hi = Lo;
06384 
06385   // The actual rotate instruction rotates bytes, so we need to scale the
06386   // rotation based on how many bytes are in the vector lane.
06387   int Scale = 16 / NumLaneElts;
06388 
06389   // SSSE3 targets can use the palignr instruction.
06390   if (Subtarget->hasSSSE3()) {
06391     // Cast the inputs to i8 vector of correct length to match PALIGNR.
06392     MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
06393     Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
06394     Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
06395 
06396     return DAG.getNode(ISD::BITCAST, DL, VT,
06397                        DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
06398                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
06399   }
06400 
06401   assert(VT.getSizeInBits() == 128 &&
06402          "Rotate-based lowering only supports 128-bit lowering!");
06403   assert(Mask.size() <= 16 &&
06404          "Can shuffle at most 16 bytes in a 128-bit vector!");
06405 
06406   // Default SSE2 implementation
06407   int LoByteShift = 16 - Rotation * Scale;
06408   int HiByteShift = Rotation * Scale;
06409 
06410   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
06411   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
06412   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
06413 
06414   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
06415                                 DAG.getConstant(LoByteShift, MVT::i8));
06416   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
06417                                 DAG.getConstant(HiByteShift, MVT::i8));
06418   return DAG.getNode(ISD::BITCAST, DL, VT,
06419                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
06420 }
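
// Worked example (editorial illustration, not from the original source): for
// the v8i16 mask <11, 12, 13, 14, 15, 0, 1, 2> the analysis finds
// Rotation = 3 with Lo = V1 and Hi = V2. With 2 bytes per element the byte
// amount is 6, so an SSSE3 target emits a single PALIGNR with immediate 6,
// while a plain SSE2 target shifts V1 left by 10 bytes, shifts V2 right by
// 6 bytes, and ORs the two results together.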
06421 
06422 /// \brief Compute whether each element of a shuffle is zeroable.
06423 ///
06424 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
06425 /// Either it is an undef element in the shuffle mask, the element of the input
06426 /// referenced is undef, or the element of the input referenced is known to be
06427 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
06428 /// as many lanes with this technique as possible to simplify the remaining
06429 /// shuffle.
06430 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
06431                                                      SDValue V1, SDValue V2) {
06432   SmallBitVector Zeroable(Mask.size(), false);
06433 
06434   while (V1.getOpcode() == ISD::BITCAST)
06435     V1 = V1->getOperand(0);
06436   while (V2.getOpcode() == ISD::BITCAST)
06437     V2 = V2->getOperand(0);
06438 
06439   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
06440   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
06441 
06442   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06443     int M = Mask[i];
06444     // Handle the easy cases.
06445     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
06446       Zeroable[i] = true;
06447       continue;
06448     }
06449 
06450     // If this is an index into a build_vector node (which has the same number
06451     // of elements), dig out the input value and use it.
06452     SDValue V = M < Size ? V1 : V2;
06453     if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
06454       continue;
06455 
06456     SDValue Input = V.getOperand(M % Size);
06457     // The UNDEF opcode check really should be dead code here, but not quite
06458     // worth asserting on (it isn't invalid, just unexpected).
06459     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
06460       Zeroable[i] = true;
06461   }
06462 
06463   return Zeroable;
06464 }
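
// Illustrative sketch (not from the original source; the helper name is
// hypothetical): when V2 is an all-zeros vector, every mask element that
// selects from V2 is reported as zeroable, as are undef elements and elements
// that read a known-zero build_vector operand of V1.
static bool exampleAllV2LanesZeroable(ArrayRef<int> Mask, SDValue V1,
                                      SDValue ZeroV2) {
  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, ZeroV2);
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (Mask[i] >= Size && !Zeroable[i])
      return false; // Cannot happen when ZeroV2 is all zeros.
  return true;
}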
06465 
06466 /// \brief Try to emit a bitmask instruction for a shuffle.
06467 ///
06468 /// This handles cases where we can model a blend exactly as a bitmask due to
06469 /// one of the inputs being zeroable.
06470 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
06471                                            SDValue V2, ArrayRef<int> Mask,
06472                                            SelectionDAG &DAG) {
06473   MVT EltVT = VT.getScalarType();
06474   int NumEltBits = EltVT.getSizeInBits();
06475   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
06476   SDValue Zero = DAG.getConstant(0, IntEltVT);
06477   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
06478   if (EltVT.isFloatingPoint()) {
06479     Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
06480     AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
06481   }
06482   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
06483   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
06484   SDValue V;
06485   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06486     if (Zeroable[i])
06487       continue;
06488     if (Mask[i] % Size != i)
06489       return SDValue(); // Not a blend.
06490     if (!V)
06491       V = Mask[i] < Size ? V1 : V2;
06492     else if (V != (Mask[i] < Size ? V1 : V2))
06493       return SDValue(); // Can only let one input through the mask.
06494 
06495     VMaskOps[i] = AllOnes;
06496   }
06497   if (!V)
06498     return SDValue(); // No non-zeroable elements!
06499 
06500   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
06501   V = DAG.getNode(VT.isFloatingPoint()
06502                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
06503                   DL, VT, V, VMask);
06504   return V;
06505 }
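
// Worked example (editorial illustration, not from the original source): for
// a v4i32 shuffle of V1 against an all-zeros V2 with Mask = <0, 5, 2, 7>,
// elements 1 and 3 are zeroable, so the whole shuffle becomes
// V1 & <-1, 0, -1, 0>, i.e. a single PAND with a constant mask rather than a
// blend.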
06506 
06507 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
06508 ///
06509 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
06510 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
06511 /// matches elements from one of the input vectors shuffled to the left or
06512 /// right with zeroable elements 'shifted in'. It handles both the strictly
06513 /// bit-wise element shifts and the byte shift across an entire 128-bit double
06514 /// quad word lane.
06515 ///
06516 /// PSLL : (little-endian) left bit shift.
06517 /// [ zz, 0, zz,  2 ]
06518 /// [ -1, 4, zz, -1 ]
06519 /// PSRL : (little-endian) right bit shift.
06520 /// [  1, zz,  3, zz]
06521 /// [ -1, -1,  7, zz]
06522 /// PSLLDQ : (little-endian) left byte shift
06523 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
06524 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
06525 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
06526 /// PSRLDQ : (little-endian) right byte shift
06527 /// [  5, 6,  7, zz, zz, zz, zz, zz]
06528 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
06529 /// [  1, 2, -1, -1, -1, -1, zz, zz]
06530 static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
06531                                          SDValue V2, ArrayRef<int> Mask,
06532                                          SelectionDAG &DAG) {
06533   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
06534 
06535   int Size = Mask.size();
06536   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
06537 
06538   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
06539     for (int i = 0; i < Size; i += Scale)
06540       for (int j = 0; j < Shift; ++j)
06541         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
06542           return false;
06543 
06544     return true;
06545   };
06546 
06547   auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
06548     for (int i = 0; i != Size; i += Scale) {
06549       unsigned Pos = Left ? i + Shift : i;
06550       unsigned Low = Left ? i : i + Shift;
06551       unsigned Len = Scale - Shift;
06552       if (!isSequentialOrUndefInRange(Mask, Pos, Len,
06553                                       Low + (V == V1 ? 0 : Size)))
06554         return SDValue();
06555     }
06556 
06557     int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
06558     bool ByteShift = ShiftEltBits > 64;
06559     unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
06560                            : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
06561     int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
06562 
06563     // Normalize the scale for byte shifts to still produce an i64 element
06564     // type.
06565     Scale = ByteShift ? Scale / 2 : Scale;
06566 
06567     // We need to round trip through the appropriate type for the shift.
06568     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
06569     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
06570     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
06571            "Illegal integer vector type");
06572     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
06573 
06574     V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
06575     return DAG.getNode(ISD::BITCAST, DL, VT, V);
06576   };
06577 
06578   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
06579   // keep doubling the size of the integer elements up to that. We can
06580   // then shift the elements of the integer vector by whole multiples of
06581   // their width within the elements of the larger integer vector. Test each
06582   // multiple to see if we can find a match with the moved element indices
06583   // and that the shifted in elements are all zeroable.
06584   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
06585     for (int Shift = 1; Shift != Scale; ++Shift)
06586       for (bool Left : {true, false})
06587         if (CheckZeros(Shift, Scale, Left))
06588           for (SDValue V : {V1, V2})
06589             if (SDValue Match = MatchShift(Shift, Scale, Left, V))
06590               return Match;
06591 
06592   // no match
06593   return SDValue();
06594 }
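
// Worked example (editorial illustration, not from the original source): for
// a v4i32 shuffle with Mask = <1, -1, 3, -1>, where the odd elements are
// zeroable (undef counts as zeroable), the match is found at Scale = 2,
// Shift = 1, Left = false: the vector is bitcast to v2i64 and shifted right
// by 32 bits (PSRLQ $32), which shifts zeros into the upper half of each
// 64-bit element.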
06595 
06596 /// \brief Lower a vector shuffle as a zero or any extension.
06597 ///
06598 /// Given a specific number of elements, element bit width, and extension
06599 /// stride, produce either a zero or any extension based on the available
06600 /// features of the subtarget.
06601 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
06602     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
06603     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
06604   assert(Scale > 1 && "Need a scale to extend.");
06605   int NumElements = VT.getVectorNumElements();
06606   int EltBits = VT.getScalarSizeInBits();
06607   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
06608          "Only 8, 16, and 32 bit elements can be extended.");
06609   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
06610 
06611   // Found a valid zext mask! Try various lowering strategies based on the
06612   // input type and available ISA extensions.
06613   if (Subtarget->hasSSE41()) {
06614     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
06615                                  NumElements / Scale);
06616     return DAG.getNode(ISD::BITCAST, DL, VT,
06617                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
06618   }
06619 
06620   // For any-extends we can cheat for larger element sizes and use shuffle
06621   // instructions that can fold with a load and/or copy.
06622   if (AnyExt && EltBits == 32) {
06623     int PSHUFDMask[4] = {0, -1, 1, -1};
06624     return DAG.getNode(
06625         ISD::BITCAST, DL, VT,
06626         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
06627                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
06628                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
06629   }
06630   if (AnyExt && EltBits == 16 && Scale > 2) {
06631     int PSHUFDMask[4] = {0, -1, 0, -1};
06632     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
06633                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
06634                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
06635     int PSHUFHWMask[4] = {1, -1, -1, -1};
06636     return DAG.getNode(
06637         ISD::BITCAST, DL, VT,
06638         DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
06639                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
06640                     getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
06641   }
06642 
06643   // If this would require more than 2 unpack instructions to expand, use
06644   // pshufb when available. We can only use more than 2 unpack instructions
06645   // when zero extending i8 elements which also makes it easier to use pshufb.
06646   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
06647     assert(NumElements == 16 && "Unexpected byte vector width!");
06648     SDValue PSHUFBMask[16];
06649     for (int i = 0; i < 16; ++i)
06650       PSHUFBMask[i] =
06651           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
06652     InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
06653     return DAG.getNode(ISD::BITCAST, DL, VT,
06654                        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
06655                                    DAG.getNode(ISD::BUILD_VECTOR, DL,
06656                                                MVT::v16i8, PSHUFBMask)));
06657   }
06658 
06659   // Otherwise emit a sequence of unpacks.
06660   do {
06661     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
06662     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
06663                          : getZeroVector(InputVT, Subtarget, DAG, DL);
06664     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
06665     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
06666     Scale /= 2;
06667     EltBits *= 2;
06668     NumElements /= 2;
06669   } while (Scale > 1);
06670   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
06671 }
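
// Worked example (editorial illustration, not from the original source): with
// VT = v16i8, Scale = 2 and AnyExt = false this zero-extends the low 8 bytes
// of InputV to 16-bit elements. On SSE4.1 that is a single VZEXT (PMOVZXBW)
// to v8i16; without SSE4.1 we fall through to a chain of UNPCKL operations
// against a zero vector, halving Scale each iteration until it reaches 1.
// Only for larger scales (i8 -> i64) on SSSE3 is the PSHUFB path taken.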
06672 
06673 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
06674 ///
06675 /// This routine will try to do everything in its power to cleverly lower
06676 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
06677 /// check for the profitability of this lowering; it tries to aggressively
06678 /// match this pattern. It will use all of the micro-architectural details it
06679 /// can to emit an efficient lowering. It handles both blends with all-zero
06680 /// inputs (which explicitly zero-extend) and undef lanes (sometimes undef
06681 /// because they are masked out later).
06682 ///
06683 /// The reason we have dedicated lowering for zext-style shuffles is that they
06684 /// are both incredibly common and often quite performance sensitive.
06685 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
06686     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
06687     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
06688   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
06689 
06690   int Bits = VT.getSizeInBits();
06691   int NumElements = VT.getVectorNumElements();
06692   assert(VT.getScalarSizeInBits() <= 32 &&
06693          "Exceeds 32-bit integer zero extension limit");
06694   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
06695 
06696   // Define a helper function to check a particular ext-scale and lower to it if
06697   // valid.
06698   auto Lower = [&](int Scale) -> SDValue {
06699     SDValue InputV;
06700     bool AnyExt = true;
06701     for (int i = 0; i < NumElements; ++i) {
06702       if (Mask[i] == -1)
06703         continue; // Valid anywhere but doesn't tell us anything.
06704       if (i % Scale != 0) {
06705         // Each of the extended elements needs to be zeroable.
06706         if (!Zeroable[i])
06707           return SDValue();
06708 
06709         // We no longer are in the anyext case.
06710         AnyExt = false;
06711         continue;
06712       }
06713 
06714       // The base elements need to form consecutive indices into the
06715       // same input vector.
06716       SDValue V = Mask[i] < NumElements ? V1 : V2;
06717       if (!InputV)
06718         InputV = V;
06719       else if (InputV != V)
06720         return SDValue(); // Flip-flopping inputs.
06721 
06722       if (Mask[i] % NumElements != i / Scale)
06723         return SDValue(); // Non-consecutive strided elements.
06724     }
06725 
06726     // If we fail to find an input, we have a zero-shuffle which should always
06727     // have already been handled.
06728     // FIXME: Maybe handle this here in case during blending we end up with one?
06729     if (!InputV)
06730       return SDValue();
06731 
06732     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
06733         DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
06734   };
06735 
06