X86ISelLowering.cpp

00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86InstrBuilder.h"
00019 #include "X86MachineFunctionInfo.h"
00020 #include "X86TargetMachine.h"
00021 #include "X86TargetObjectFile.h"
00022 #include "llvm/ADT/SmallBitVector.h"
00023 #include "llvm/ADT/SmallSet.h"
00024 #include "llvm/ADT/Statistic.h"
00025 #include "llvm/ADT/StringExtras.h"
00026 #include "llvm/ADT/StringSwitch.h"
00027 #include "llvm/ADT/VariadicFunction.h"
00028 #include "llvm/CodeGen/IntrinsicLowering.h"
00029 #include "llvm/CodeGen/MachineFrameInfo.h"
00030 #include "llvm/CodeGen/MachineFunction.h"
00031 #include "llvm/CodeGen/MachineInstrBuilder.h"
00032 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00033 #include "llvm/CodeGen/MachineModuleInfo.h"
00034 #include "llvm/CodeGen/MachineRegisterInfo.h"
00035 #include "llvm/IR/CallSite.h"
00036 #include "llvm/IR/CallingConv.h"
00037 #include "llvm/IR/Constants.h"
00038 #include "llvm/IR/DerivedTypes.h"
00039 #include "llvm/IR/Function.h"
00040 #include "llvm/IR/GlobalAlias.h"
00041 #include "llvm/IR/GlobalVariable.h"
00042 #include "llvm/IR/Instructions.h"
00043 #include "llvm/IR/Intrinsics.h"
00044 #include "llvm/MC/MCAsmInfo.h"
00045 #include "llvm/MC/MCContext.h"
00046 #include "llvm/MC/MCExpr.h"
00047 #include "llvm/MC/MCSymbol.h"
00048 #include "llvm/Support/CommandLine.h"
00049 #include "llvm/Support/Debug.h"
00050 #include "llvm/Support/ErrorHandling.h"
00051 #include "llvm/Support/MathExtras.h"
00052 #include "llvm/Target/TargetOptions.h"
00053 #include "X86IntrinsicsInfo.h"
00054 #include <bitset>
00055 #include <numeric>
00056 #include <cctype>
00057 using namespace llvm;
00058 
00059 #define DEBUG_TYPE "x86-isel"
00060 
00061 STATISTIC(NumTailCalls, "Number of tail calls");
00062 
00063 static cl::opt<bool> ExperimentalVectorWideningLegalization(
00064     "x86-experimental-vector-widening-legalization", cl::init(false),
00065     cl::desc("Enable an experimental vector type legalization through widening "
00066              "rather than promotion."),
00067     cl::Hidden);
00068 
00069 static cl::opt<bool> ExperimentalVectorShuffleLowering(
00070     "x86-experimental-vector-shuffle-lowering", cl::init(true),
00071     cl::desc("Enable an experimental vector shuffle lowering code path."),
00072     cl::Hidden);
00073 
00074 // Forward declarations.
00075 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00076                        SDValue V2);
00077 
00078 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
00079                                 SelectionDAG &DAG, SDLoc dl,
00080                                 unsigned vectorWidth) {
00081   assert((vectorWidth == 128 || vectorWidth == 256) &&
00082          "Unsupported vector width");
00083   EVT VT = Vec.getValueType();
00084   EVT ElVT = VT.getVectorElementType();
00085   unsigned Factor = VT.getSizeInBits()/vectorWidth;
00086   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
00087                                   VT.getVectorNumElements()/Factor);
00088 
00089   // Extract from UNDEF is UNDEF.
00090   if (Vec.getOpcode() == ISD::UNDEF)
00091     return DAG.getUNDEF(ResultVT);
00092 
00093   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
00094   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
00095 
00096   // This is the index of the first element of the vectorWidth-bit chunk
00097   // we want.
00098   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
00099                                * ElemsPerChunk);
00100 
00101   // If the input is a buildvector just emit a smaller one.
00102   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
00103     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
00104                        makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
00105                                     ElemsPerChunk));
00106 
00107   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00108   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
00109                                VecIdx);
00110 
00111   return Result;
00112 
00113 }
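// Illustrative sketch (assumed values, not from a specific caller): for a
// v8i32 input (256 bits) with vectorWidth = 128 and IdxVal = 4, the math
// above gives ElemsPerChunk = 128/32 = 4 and
// NormalizedIdxVal = ((4 * 32) / 128) * 4 = 4, so the helper emits
// EXTRACT_SUBVECTOR(Vec, 4) and returns the upper 128-bit half as a v4i32.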
00114 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
00115 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
00116 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
00117 /// instructions or a simple subregister reference. Idx is an index in the
00118 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
00119 /// lowering EXTRACT_VECTOR_ELT operations easier.
00120 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
00121                                    SelectionDAG &DAG, SDLoc dl) {
00122   assert((Vec.getValueType().is256BitVector() ||
00123           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
00124   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
00125 }
00126 
00127 /// Generate a DAG to grab 256-bits from a 512-bit vector.
00128 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
00129                                    SelectionDAG &DAG, SDLoc dl) {
00130   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
00131   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
00132 }
00133 
00134 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
00135                                unsigned IdxVal, SelectionDAG &DAG,
00136                                SDLoc dl, unsigned vectorWidth) {
00137   assert((vectorWidth == 128 || vectorWidth == 256) &&
00138          "Unsupported vector width");
00139   // Inserting UNDEF just returns Result unchanged.
00140   if (Vec.getOpcode() == ISD::UNDEF)
00141     return Result;
00142   EVT VT = Vec.getValueType();
00143   EVT ElVT = VT.getVectorElementType();
00144   EVT ResultVT = Result.getValueType();
00145 
00146   // Insert the relevant vectorWidth bits.
00147   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
00148 
00149   // This is the index of the first element of the vectorWidth-bit chunk
00150   // we want.
00151   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
00152                                * ElemsPerChunk);
00153 
00154   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00155   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
00156                      VecIdx);
00157 }
00158 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
00159 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
00160 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
00161 /// simple superregister reference.  Idx is an index in the 128 bits
00162 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
00163 /// lowering INSERT_VECTOR_ELT operations easier.
00164 static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
00165                                   unsigned IdxVal, SelectionDAG &DAG,
00166                                   SDLoc dl) {
00167   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
00168   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
00169 }
00170 
00171 static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
00172                                   unsigned IdxVal, SelectionDAG &DAG,
00173                                   SDLoc dl) {
00174   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
00175   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
00176 }
00177 
00178 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
00179 /// instructions. This is used because creating CONCAT_VECTOR nodes of
00180 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
00181 /// large BUILD_VECTORS.
00182 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
00183                                    unsigned NumElems, SelectionDAG &DAG,
00184                                    SDLoc dl) {
00185   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00186   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
00187 }
00188 
00189 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
00190                                    unsigned NumElems, SelectionDAG &DAG,
00191                                    SDLoc dl) {
00192   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00193   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
00194 }
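// Illustrative sketch (assumed operand types): Concat128BitVectors with two
// v4i32 inputs, VT = v8i32 and NumElems = 8, builds
//   t0 = INSERT_SUBVECTOR undef:v8i32, V1, 0    ; lower 128 bits
//   t1 = INSERT_SUBVECTOR t0,          V2, 4    ; upper 128 bits
// which instruction selection then matches to VINSERTF128/VINSERTI128.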
00195 
00196 static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
00197   if (TT.isOSBinFormatMachO()) {
00198     if (TT.getArch() == Triple::x86_64)
00199       return new X86_64MachoTargetObjectFile();
00200     return new TargetLoweringObjectFileMachO();
00201   }
00202 
00203   if (TT.isOSLinux())
00204     return new X86LinuxTargetObjectFile();
00205   if (TT.isOSBinFormatELF())
00206     return new TargetLoweringObjectFileELF();
00207   if (TT.isKnownWindowsMSVCEnvironment())
00208     return new X86WindowsTargetObjectFile();
00209   if (TT.isOSBinFormatCOFF())
00210     return new TargetLoweringObjectFileCOFF();
00211   llvm_unreachable("unknown subtarget type");
00212 }
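// Rough examples of the mapping above (assuming these triples reach here):
//   x86_64-apple-darwin    -> X86_64MachoTargetObjectFile
//   i686-pc-linux-gnu      -> X86LinuxTargetObjectFile
//   x86_64-pc-windows-msvc -> X86WindowsTargetObjectFile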
00213 
00214 // FIXME: This should stop caching the target machine as soon as
00215 // we can remove resetOperationActions et al.
00216 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
00217     : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
00218   Subtarget = &TM.getSubtarget<X86Subtarget>();
00219   X86ScalarSSEf64 = Subtarget->hasSSE2();
00220   X86ScalarSSEf32 = Subtarget->hasSSE1();
00221   TD = getDataLayout();
00222 
00223   resetOperationActions();
00224 }
00225 
00226 void X86TargetLowering::resetOperationActions() {
00227   const TargetMachine &TM = getTargetMachine();
00228   static bool FirstTimeThrough = true;
00229 
00230   // If none of the target options have changed, then we don't need to reset the
00231   // operation actions.
00232   if (!FirstTimeThrough && TO == TM.Options) return;
00233 
00234   if (!FirstTimeThrough) {
00235     // Reinitialize the actions.
00236     initActions();
00237     FirstTimeThrough = false;
00238   }
00239 
00240   TO = TM.Options;
00241 
00242   // Set up the TargetLowering object.
00243   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00244 
00245   // X86 is weird, it always uses i8 for shift amounts and setcc results.
00246   setBooleanContents(ZeroOrOneBooleanContent);
00247   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00248   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
00249 
00250   // For 64-bit, use the ILP scheduler since we have so many registers; for
00251   // 32-bit code, use register-pressure-specific scheduling.
00252   // For Atom, always use ILP scheduling.
00253   if (Subtarget->isAtom())
00254     setSchedulingPreference(Sched::ILP);
00255   else if (Subtarget->is64Bit())
00256     setSchedulingPreference(Sched::ILP);
00257   else
00258     setSchedulingPreference(Sched::RegPressure);
00259   const X86RegisterInfo *RegInfo =
00260       TM.getSubtarget<X86Subtarget>().getRegisterInfo();
00261   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00262 
00263   // Bypass expensive divides on Atom when compiling with O2
00264   if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
00265     addBypassSlowDiv(32, 8);
00266     if (Subtarget->is64Bit())
00267       addBypassSlowDiv(64, 16);
00268   }
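  // Illustrative sketch of the bypass above: addBypassSlowDiv(32, 8) lets
  // CodeGenPrepare rewrite a 32-bit division, roughly as
  //   if both operands fit in 8 bits: use the much cheaper 8-bit divide
  //   else:                           fall back to the full 32-bit divide
  // trading a cheap run-time check for the slow wide divide.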
00269 
00270   if (Subtarget->isTargetKnownWindowsMSVC()) {
00271     // Setup Windows compiler runtime calls.
00272     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00273     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00274     setLibcallName(RTLIB::SREM_I64, "_allrem");
00275     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00276     setLibcallName(RTLIB::MUL_I64, "_allmul");
00277     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00278     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00279     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00280     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00281     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00282 
00283     // The _ftol2 runtime function has an unusual calling conv, which
00284     // is modeled by a special pseudo-instruction.
00285     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
00286     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
00287     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
00288     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
00289   }
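  // Illustrative example (assumed IR): on a 32-bit MSVC target, a 64-bit
  //   %q = sdiv i64 %a, %b
  // is lowered to a call to "_alldiv" using the X86_StdCall convention set
  // above, since 32-bit mode has no 64-bit divide instruction.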
00290 
00291   if (Subtarget->isTargetDarwin()) {
00292     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00293     setUseUnderscoreSetJmp(false);
00294     setUseUnderscoreLongJmp(false);
00295   } else if (Subtarget->isTargetWindowsGNU()) {
00296     // MS runtime is weird: it exports _setjmp, but only plain longjmp!
00297     setUseUnderscoreSetJmp(true);
00298     setUseUnderscoreLongJmp(false);
00299   } else {
00300     setUseUnderscoreSetJmp(true);
00301     setUseUnderscoreLongJmp(true);
00302   }
00303 
00304   // Set up the register classes.
00305   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00306   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00307   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00308   if (Subtarget->is64Bit())
00309     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00310 
00311   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
00312 
00313   // We don't accept any truncstore of integer registers.
00314   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00315   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00316   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00317   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00318   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00319   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00320 
00321   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00322 
00323   // SETOEQ and SETUNE require checking two conditions.
00324   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00325   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00326   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00327   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00328   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00329   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00330 
00331   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00332   // operation.
00333   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00334   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00335   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00336 
00337   if (Subtarget->is64Bit()) {
00338     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00339     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00340   } else if (!TM.Options.UseSoftFloat) {
00341     // We have an algorithm for SSE2->double, and we turn this into a
00342     // 64-bit FILD followed by conditional FADD for other targets.
00343     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00344     // We have an algorithm for SSE2, and we turn this into a 64-bit
00345     // FILD for other targets.
00346     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00347   }
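  // Illustrative example (assumed IR) of the promotion above: a conversion
  //   %f = uitofp i16 %x to float
  // is legalized roughly as a zero-extension of %x followed by a signed
  // SINT_TO_FP, which is safe because the zero-extended value is never
  // negative.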
00348 
00349   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00350   // this operation.
00351   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00352   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00353 
00354   if (!TM.Options.UseSoftFloat) {
00355     // SSE has no i16 to fp conversion, only i32
00356     if (X86ScalarSSEf32) {
00357       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00358       // f32 and f64 cases are Legal, f80 case is not
00359       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00360     } else {
00361       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00362       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00363     }
00364   } else {
00365     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00366     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00367   }
00368 
00369   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00370   // are Legal, f80 is custom lowered.
00371   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00372   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00373 
00374   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
00375   // this operation.
00376   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00377   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00378 
00379   if (X86ScalarSSEf32) {
00380     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00381     // f32 and f64 cases are Legal, f80 case is not
00382     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00383   } else {
00384     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00385     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00386   }
00387 
00388   // Handle FP_TO_UINT by promoting the destination to a larger signed
00389   // conversion.
00390   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00391   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00392   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
00393 
00394   if (Subtarget->is64Bit()) {
00395     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00396     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00397   } else if (!TM.Options.UseSoftFloat) {
00398     // Since AVX is a superset of SSE3, only check for SSE here.
00399     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00400       // Expand FP_TO_UINT into a select.
00401       // FIXME: We would like to use a Custom expander here eventually to do
00402       // the optimal thing for SSE vs. the default expansion in the legalizer.
00403       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00404     else
00405       // With SSE3 we can use fisttpll to convert to a signed i64; without
00406       // SSE, we're stuck with a fistpll.
00407       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00408   }
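  // Illustrative example (assumed IR) of the promotion above:
  //   %u = fptoui float %f to i16
  // is handled as FP_TO_SINT to i32 followed by a truncate; any value that
  // fits in u16 also fits in the signed i32 range, so the result is the same.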
00409 
00410   if (isTargetFTOL()) {
00411     // Use the _ftol2 runtime function, which has a pseudo-instruction
00412     // to handle its weird calling convention.
00413     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00414   }
00415 
00416   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00417   if (!X86ScalarSSEf64) {
00418     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00419     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00420     if (Subtarget->is64Bit()) {
00421       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00422       // Without SSE, i64->f64 goes through memory.
00423       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00424     }
00425   }
00426 
00427   // Scalar integer divide and remainder are lowered to use operations that
00428   // produce two results, to match the available instructions. This exposes
00429   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00430   // into a single instruction.
00431   //
00432   // Scalar integer multiply-high is also lowered to use two-result
00433   // operations, to match the available instructions. However, plain multiply
00434   // (low) operations are left as Legal, as there are single-result
00435   // instructions for this in x86. Using the two-result multiply instructions
00436   // when both high and low results are needed must be arranged by dagcombine.
00437   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00438     MVT VT = IntVTs[i];
00439     setOperationAction(ISD::MULHS, VT, Expand);
00440     setOperationAction(ISD::MULHU, VT, Expand);
00441     setOperationAction(ISD::SDIV, VT, Expand);
00442     setOperationAction(ISD::UDIV, VT, Expand);
00443     setOperationAction(ISD::SREM, VT, Expand);
00444     setOperationAction(ISD::UREM, VT, Expand);
00445 
00446     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
00447     setOperationAction(ISD::ADDC, VT, Custom);
00448     setOperationAction(ISD::ADDE, VT, Custom);
00449     setOperationAction(ISD::SUBC, VT, Custom);
00450     setOperationAction(ISD::SUBE, VT, Custom);
00451   }
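  // Illustrative example (assumed IR) of the two-result benefit above: given
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // both expand to ISD::SDIVREM, CSE merges them into one node, and a single
  // IDIV then yields the quotient in EAX and the remainder in EDX.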
00452 
00453   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00454   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00455   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00456   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00457   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00458   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00459   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00460   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00461   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00462   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
00463   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
00464   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
00465   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
00466   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
00467   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
00468   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
00469   if (Subtarget->is64Bit())
00470     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00471   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00472   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00473   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00474   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00475   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00476   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00477   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00478   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00479 
00480   // Promote the i8 variants and force them up to i32, which has a shorter
00481   // encoding.
00482   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00483   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00484   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00485   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
00486   if (Subtarget->hasBMI()) {
00487     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00488     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00489     if (Subtarget->is64Bit())
00490       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00491   } else {
00492     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00493     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00494     if (Subtarget->is64Bit())
00495       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00496   }
00497 
00498   if (Subtarget->hasLZCNT()) {
00499     // When promoting the i8 variants, force them to i32 for a shorter
00500     // encoding.
00501     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00502     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00503     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00504     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00505     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00506     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00507     if (Subtarget->is64Bit())
00508       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00509   } else {
00510     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00511     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00512     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00513     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00514     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00515     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00516     if (Subtarget->is64Bit()) {
00517       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00518       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00519     }
00520   }
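  // Rough sketch of the custom lowering above (without BMI/LZCNT): CTTZ and
  // CTLZ are implemented with BSF/BSR plus fix-ups (e.g. XOR with 31 for CTLZ
  // and a CMOV to handle a zero input), while the i8 variants are promoted to
  // i32 first so the shorter 32-bit encodings can be used.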
00521 
00522   // Special handling for half-precision floating point conversions.
00523   // If we don't have F16C support, then lower half float conversions
00524   // into library calls.
00525   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
00526     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
00527     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
00528   }
00529 
00530   // There's never any support for operations beyond MVT::f32.
00531   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
00532   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
00533   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
00534   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
00535 
00536   setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
00537   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00538   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00539   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
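  // Illustrative example: without F16C, an f16 -> f32 extension is expanded
  // to a runtime call (conventionally __gnu_h2f_ieee, with __gnu_f2h_ieee for
  // the opposite direction); with F16C the VCVTPH2PS/VCVTPS2PH instructions
  // can be used instead.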
00540 
00541   if (Subtarget->hasPOPCNT()) {
00542     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00543   } else {
00544     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00545     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00546     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00547     if (Subtarget->is64Bit())
00548       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00549   }
00550 
00551   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00552 
00553   if (!Subtarget->hasMOVBE())
00554     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00555 
00556   // These should be promoted to a larger select which is supported.
00557   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00558   // X86 wants to expand cmov itself.
00559   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00560   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00561   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00562   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00563   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00564   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00565   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00566   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00567   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00568   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00569   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00570   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00571   if (Subtarget->is64Bit()) {
00572     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00573     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00574   }
00575   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00576   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
00577   // SjLj exception handling, but rather a lightweight setjmp/longjmp
00578   // replacement for continuations, user-level threading, etc. As a result, no
00579   // other SjLj exception interfaces are implemented, so please don't build
00580   // your own exception handling based on them.
00581   // LLVM/Clang supports zero-cost DWARF exception handling.
00582   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00583   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00584 
00585   // Darwin ABI issue.
00586   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00587   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00588   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00589   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00590   if (Subtarget->is64Bit())
00591     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00592   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00593   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00594   if (Subtarget->is64Bit()) {
00595     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00596     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00597     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00598     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00599     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00600   }
00601   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00602   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00603   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00604   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00605   if (Subtarget->is64Bit()) {
00606     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00607     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00608     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00609   }
00610 
00611   if (Subtarget->hasSSE1())
00612     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00613 
00614   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00615 
00616   // Expand certain atomics
00617   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00618     MVT VT = IntVTs[i];
00619     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
00620     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00621     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00622   }
00623 
00624   if (Subtarget->hasCmpxchg16b()) {
00625     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
00626   }
00627 
00628   // FIXME - use subtarget debug flags
00629   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
00630       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
00631     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00632   }
00633 
00634   if (Subtarget->is64Bit()) {
00635     setExceptionPointerRegister(X86::RAX);
00636     setExceptionSelectorRegister(X86::RDX);
00637   } else {
00638     setExceptionPointerRegister(X86::EAX);
00639     setExceptionSelectorRegister(X86::EDX);
00640   }
00641   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00642   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00643 
00644   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00645   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00646 
00647   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00648   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00649 
00650   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00651   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00652   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00653   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00654     // TargetInfo::X86_64ABIBuiltinVaList
00655     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00656     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00657   } else {
00658     // TargetInfo::CharPtrBuiltinVaList
00659     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00660     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00661   }
00662 
00663   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00664   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00665 
00666   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
00667 
00668   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00669     // f32 and f64 use SSE.
00670     // Set up the FP register classes.
00671     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00672     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00673 
00674     // Use ANDPD to simulate FABS.
00675     setOperationAction(ISD::FABS , MVT::f64, Custom);
00676     setOperationAction(ISD::FABS , MVT::f32, Custom);
00677 
00678     // Use XORP to simulate FNEG.
00679     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00680     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00681 
00682     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00683     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00684     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00685 
00686     // Lower this to FGETSIGNx86 plus an AND.
00687     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00688     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00689 
00690     // We don't support sin/cos/fmod
00691     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00692     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00693     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00694     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00695     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00696     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00697 
00698     // Expand FP immediates into loads from the stack, except for the special
00699     // cases we handle.
00700     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00701     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00702   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00703     // Use SSE for f32, x87 for f64.
00704     // Set up the FP register classes.
00705     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00706     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00707 
00708     // Use ANDPS to simulate FABS.
00709     setOperationAction(ISD::FABS , MVT::f32, Custom);
00710 
00711     // Use XORP to simulate FNEG.
00712     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00713 
00714     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00715 
00716     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00717     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00718     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00719 
00720     // We don't support sin/cos/fmod
00721     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00722     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00723     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00724 
00725     // Special cases we handle for FP constants.
00726     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00727     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00728     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00729     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00730     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00731 
00732     if (!TM.Options.UnsafeFPMath) {
00733       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00734       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00735       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00736     }
00737   } else if (!TM.Options.UseSoftFloat) {
00738     // f32 and f64 in x87.
00739     // Set up the FP register classes.
00740     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00741     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00742 
00743     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00744     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00745     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00746     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00747 
00748     if (!TM.Options.UnsafeFPMath) {
00749       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00750       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00751       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00752       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00753       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00754       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00755     }
00756     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00757     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00758     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00759     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00760     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00761     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00762     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00763     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00764   }
00765 
00766   // We don't support FMA.
00767   setOperationAction(ISD::FMA, MVT::f64, Expand);
00768   setOperationAction(ISD::FMA, MVT::f32, Expand);
00769 
00770   // Long double always uses X87.
00771   if (!TM.Options.UseSoftFloat) {
00772     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00773     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00774     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00775     {
00776       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00777       addLegalFPImmediate(TmpFlt);  // FLD0
00778       TmpFlt.changeSign();
00779       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00780 
00781       bool ignored;
00782       APFloat TmpFlt2(+1.0);
00783       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00784                       &ignored);
00785       addLegalFPImmediate(TmpFlt2);  // FLD1
00786       TmpFlt2.changeSign();
00787       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00788     }
00789 
00790     if (!TM.Options.UnsafeFPMath) {
00791       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00792       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00793       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00794     }
00795 
00796     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00797     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00798     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00799     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00800     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00801     setOperationAction(ISD::FMA, MVT::f80, Expand);
00802   }
00803 
00804   // Always use a library call for pow.
00805   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00806   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00807   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00808 
00809   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00810   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00811   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00812   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00813   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00814   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
00815   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
00816 
00817   // First set operation action for all vector types to either promote
00818   // (for widening) or expand (for scalarization). Then we will selectively
00819   // turn on ones that can be effectively codegen'd.
00820   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
00821            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
00822     MVT VT = (MVT::SimpleValueType)i;
00823     setOperationAction(ISD::ADD , VT, Expand);
00824     setOperationAction(ISD::SUB , VT, Expand);
00825     setOperationAction(ISD::FADD, VT, Expand);
00826     setOperationAction(ISD::FNEG, VT, Expand);
00827     setOperationAction(ISD::FSUB, VT, Expand);
00828     setOperationAction(ISD::MUL , VT, Expand);
00829     setOperationAction(ISD::FMUL, VT, Expand);
00830     setOperationAction(ISD::SDIV, VT, Expand);
00831     setOperationAction(ISD::UDIV, VT, Expand);
00832     setOperationAction(ISD::FDIV, VT, Expand);
00833     setOperationAction(ISD::SREM, VT, Expand);
00834     setOperationAction(ISD::UREM, VT, Expand);
00835     setOperationAction(ISD::LOAD, VT, Expand);
00836     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00837     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00838     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00839     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00840     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00841     setOperationAction(ISD::FABS, VT, Expand);
00842     setOperationAction(ISD::FSIN, VT, Expand);
00843     setOperationAction(ISD::FSINCOS, VT, Expand);
00844     setOperationAction(ISD::FCOS, VT, Expand);
00845     setOperationAction(ISD::FSINCOS, VT, Expand);
00846     setOperationAction(ISD::FREM, VT, Expand);
00847     setOperationAction(ISD::FMA,  VT, Expand);
00848     setOperationAction(ISD::FPOWI, VT, Expand);
00849     setOperationAction(ISD::FSQRT, VT, Expand);
00850     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00851     setOperationAction(ISD::FFLOOR, VT, Expand);
00852     setOperationAction(ISD::FCEIL, VT, Expand);
00853     setOperationAction(ISD::FTRUNC, VT, Expand);
00854     setOperationAction(ISD::FRINT, VT, Expand);
00855     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00856     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00857     setOperationAction(ISD::MULHS, VT, Expand);
00858     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00859     setOperationAction(ISD::MULHU, VT, Expand);
00860     setOperationAction(ISD::SDIVREM, VT, Expand);
00861     setOperationAction(ISD::UDIVREM, VT, Expand);
00862     setOperationAction(ISD::FPOW, VT, Expand);
00863     setOperationAction(ISD::CTPOP, VT, Expand);
00864     setOperationAction(ISD::CTTZ, VT, Expand);
00865     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00866     setOperationAction(ISD::CTLZ, VT, Expand);
00867     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00868     setOperationAction(ISD::SHL, VT, Expand);
00869     setOperationAction(ISD::SRA, VT, Expand);
00870     setOperationAction(ISD::SRL, VT, Expand);
00871     setOperationAction(ISD::ROTL, VT, Expand);
00872     setOperationAction(ISD::ROTR, VT, Expand);
00873     setOperationAction(ISD::BSWAP, VT, Expand);
00874     setOperationAction(ISD::SETCC, VT, Expand);
00875     setOperationAction(ISD::FLOG, VT, Expand);
00876     setOperationAction(ISD::FLOG2, VT, Expand);
00877     setOperationAction(ISD::FLOG10, VT, Expand);
00878     setOperationAction(ISD::FEXP, VT, Expand);
00879     setOperationAction(ISD::FEXP2, VT, Expand);
00880     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00881     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00882     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00883     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00884     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00885     setOperationAction(ISD::TRUNCATE, VT, Expand);
00886     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00887     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00888     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00889     setOperationAction(ISD::VSELECT, VT, Expand);
00890     setOperationAction(ISD::SELECT_CC, VT, Expand);
00891     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
00892              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
00893       setTruncStoreAction(VT,
00894                           (MVT::SimpleValueType)InnerVT, Expand);
00895     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
00896     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
00897 
00898     // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types;
00899     // we have to deal with them whether we ask for Expansion or not. Setting
00900     // Expand causes its own optimisation problems though, so leave them legal.
00901     if (VT.getVectorElementType() == MVT::i1)
00902       setLoadExtAction(ISD::EXTLOAD, VT, Expand);
00903   }
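  // Illustrative consequence of the blanket Expand above: every vector
  // operation starts out unsupported, and only the cases explicitly flipped
  // back below (e.g. ISD::ADD on MVT::v4i32 becoming Legal in the SSE2 block)
  // are ever emitted directly.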
00904 
00905   // FIXME: To prevent SSE instructions from being expanded to MMX ones
00906   // with -msoft-float, disable use of MMX as well.
00907   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00908     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00909     // No operations on x86mmx supported, everything uses intrinsics.
00910   }
00911 
00912   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00913   // into smaller operations.
00914   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
00915   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
00916   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
00917   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
00918   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
00919   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
00920   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
00921   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
00922   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
00923   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
00924   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
00925   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
00926   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
00927   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
00928   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
00929   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
00930   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
00931   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
00932   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
00933   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
00934   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00935   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
00936   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
00937   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
00938   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
00939   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
00940   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
00941   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
00942   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
00943 
00944   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00945     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00946 
00947     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00948     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00949     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00950     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00951     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00952     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00953     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00954     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00955     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00956     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00957     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00958     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00959   }
00960 
00961   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00962     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00963 
00964     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
00965     // registers cannot be used even for integer operations.
00966     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00967     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00968     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00969     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00970 
00971     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00972     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00973     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00974     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00975     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00976     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00977     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
00978     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
00979     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
00980     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
00981     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00982     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00983     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00984     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00985     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00986     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00987     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00988     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00989     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00990     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00991     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00992     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00993 
00994     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00995     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00996     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00997     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00998 
00999     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
01000     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
01001     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01002     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01003     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01004 
01005     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
01006     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01007       MVT VT = (MVT::SimpleValueType)i;
01008       // Do not attempt to custom lower non-power-of-2 vectors
01009       if (!isPowerOf2_32(VT.getVectorNumElements()))
01010         continue;
01011       // Do not attempt to custom lower non-128-bit vectors
01012       if (!VT.is128BitVector())
01013         continue;
01014       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01015       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01016       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01017     }
01018 
01019     // We support custom legalizing of sext and anyext loads for specific
01020     // memory vector types which we can load as a scalar (or sequence of
01021     // scalars) and extend in-register to a legal 128-bit vector type. For sext
01022     // loads these must work with a single scalar load.
01023     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
01024     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
01025     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
01026     setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
01027     setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
01028     setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
01029     setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
01030     setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
01031     setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);
01032 
01033     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
01034     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
01035     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
01036     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
01037     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
01038     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
01039 
01040     if (Subtarget->is64Bit()) {
01041       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01042       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01043     }
01044 
01045     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
01046     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01047       MVT VT = (MVT::SimpleValueType)i;
01048 
01049       // Do not attempt to promote non-128-bit vectors
01050       if (!VT.is128BitVector())
01051         continue;
01052 
01053       setOperationAction(ISD::AND,    VT, Promote);
01054       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
01055       setOperationAction(ISD::OR,     VT, Promote);
01056       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
01057       setOperationAction(ISD::XOR,    VT, Promote);
01058       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
01059       setOperationAction(ISD::LOAD,   VT, Promote);
01060       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
01061       setOperationAction(ISD::SELECT, VT, Promote);
01062       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
01063     }
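    // Illustrative example of the promotion above: an AND of two v16i8 values
    // is bitcast to v2i64, performed as a single v2i64 AND (one PAND), and
    // bitcast back, since the bitwise result is identical for any lane width.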
01064 
01065     // Custom lower v2i64 and v2f64 selects.
01066     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
01067     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
01068     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
01069     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
01070 
01071     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
01072     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
01073 
01074     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
01075     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
01076     // As there is no 64-bit GPR available, we need to build a special custom
01077     // sequence to convert from v2i32 to v2f32.
01078     if (!Subtarget->is64Bit())
01079       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
01080 
01081     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
01082     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
01083 
01084     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
01085 
01086     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
01087     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
01088     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
01089   }
01090 
01091   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
01092     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
01093     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
01094     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
01095     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
01096     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
01097     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
01098     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
01099     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
01100     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
01101     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
01102 
01103     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
01104     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
01105     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
01106     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
01107     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
01108     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
01109     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
01110     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
01111     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
01112     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
01113 
01114     // FIXME: Do we need to handle scalar-to-vector here?
01115     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
01116 
01117     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
01118     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
01119     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
01120     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
01121     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
01122     // There is no BLENDI (immediate blend) for byte vectors, so VSELECT on
01123     // v16i8 is left Legal rather than custom lowered for now.
01124     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
01125 
01126     // SSE41 brings specific instructions for doing vector sign extend even in
01127     // cases where we don't have SRA.
01128     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
01129     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
01130     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);
01131 
01132     // i8 and i16 vectors are custom because the source register and source
01133     // memory operand types are not the same width.  f32 vectors are
01134     // custom since the immediate controlling the insert encodes additional
01135     // information.
01136     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
01137     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01138     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01139     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01140 
01141     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
01142     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
01143     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
01144     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
01145 
01146     // FIXME: these should be Legal, but that's only for the case where
01147     // the index is constant.  For now custom expand to deal with that.
01148     if (Subtarget->is64Bit()) {
01149       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01150       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01151     }
01152   }
01153 
01154   if (Subtarget->hasSSE2()) {
01155     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
01156     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
01157 
01158     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01159     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01160 
01161     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01162     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01163 
01164     // In the customized shift lowering, the legal cases in AVX2 will be
01165     // recognized.
01166     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01167     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01168 
01169     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01170     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01171 
01172     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01173   }
01174 
01175   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01176     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01177     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01178     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01179     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01180     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01181     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01182 
01183     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01184     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01185     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01186 
01187     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01188     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01189     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01190     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01191     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01192     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01193     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01194     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01195     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01196     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01197     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01198     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01199 
01200     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01201     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01202     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01203     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01204     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01205     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01206     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01207     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01208     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01209     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01210     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01211     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01212 
01213     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01214     // even though v8i16 is a legal type.
01215     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01216     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01217     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01218 
01219     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01220     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01221     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01222 
01223     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01224     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01225 
01226     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
01227 
01228     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01229     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01230 
01231     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01232     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01233 
01234     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01235     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01236 
01237     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01238     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01239     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01240     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01241 
01242     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01243     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01244     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01245 
01246     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
01247     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
01248     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
01249     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
01250 
01251     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01252     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01253     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01254     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01255     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01256     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01257     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01258     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01259     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01260     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01261     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01262     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01263 
01264     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01265       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01266       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01267       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01268       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01269       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01270       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01271     }
01272 
01273     if (Subtarget->hasInt256()) {
01274       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01275       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01276       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01277       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01278 
01279       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01280       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01281       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01282       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01283 
01284       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01285       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01286       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01287       // Don't lower v32i8 because there is no 128-bit byte mul
01288 
01289       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
01290       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
01291       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
01292       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
01293 
01294       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
01295       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01296     } else {
01297       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01298       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01299       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01300       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01301 
01302       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01303       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01304       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01305       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01306 
01307       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01308       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01309       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01310       // Don't lower v32i8 because there is no 128-bit byte mul
01311     }
01312 
01313     // In the customized shift lowering, the legal cases in AVX2 will be
01314     // recognized.
01315     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01316     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01317 
01318     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01319     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01320 
01321     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01322 
01323     // Custom lower several nodes for 256-bit types.
01324     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01325              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01326       MVT VT = (MVT::SimpleValueType)i;
01327 
01328       // Extract subvector is special because the value type
01329       // (result) is 128-bit but the source is 256-bit wide.
01330       if (VT.is128BitVector())
01331         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01332 
01333       // Do not attempt to custom lower other non-256-bit vectors
01334       if (!VT.is256BitVector())
01335         continue;
01336 
01337       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01338       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01339       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01340       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01341       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01342       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01343       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01344     }
01345 
01346     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01347     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01348       MVT VT = (MVT::SimpleValueType)i;
01349 
01350       // Do not attempt to promote non-256-bit vectors
01351       if (!VT.is256BitVector())
01352         continue;
01353 
01354       setOperationAction(ISD::AND,    VT, Promote);
01355       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01356       setOperationAction(ISD::OR,     VT, Promote);
01357       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01358       setOperationAction(ISD::XOR,    VT, Promote);
01359       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01360       setOperationAction(ISD::LOAD,   VT, Promote);
01361       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01362       setOperationAction(ISD::SELECT, VT, Promote);
01363       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01364     }
01365   }
01366 
01367   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01368     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01369     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01370     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01371     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01372 
01373     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01374     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01375     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01376 
01377     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01378     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01379     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01380     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01381     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01382     setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
01383     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01384     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01385     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01386     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01387     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01388 
01389     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01390     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01391     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01392     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01393     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01394     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01395 
01396     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01397     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01398     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01399     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01400     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01401     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01402     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01403     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01404 
01405     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01406     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01407     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01408     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01409     if (Subtarget->is64Bit()) {
01410       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01411       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01412       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01413       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01414     }
01415     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01416     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01417     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01418     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01419     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01420     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01421     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01422     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01423     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01424     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01425 
01426     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01427     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01428     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01429     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01430     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01431     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01432     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01433     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01434     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01435     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01436     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01437     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01438     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01439 
01440     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01441     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01442     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01443     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01444     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
01445     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01446 
01447     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01448     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01449 
01450     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01451 
01452     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01453     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01454     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01455     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01456     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01457     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01458     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01459     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01460     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01461 
01462     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01463     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01464 
01465     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01466     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01467 
01468     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01469 
01470     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01471     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01472 
01473     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01474     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01475 
01476     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01477     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01478 
01479     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01480     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01481     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01482     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01483     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01484     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01485 
01486     if (Subtarget->hasCDI()) {
01487       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
01488       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
01489     }
01490 
01491     // Custom lower several nodes.
01492     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01493              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01494       MVT VT = (MVT::SimpleValueType)i;
01495 
01496       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01497       // Extract subvector is special because the value type
01498       // (result) is 256/128-bit but the source is 512-bit wide.
01499       if (VT.is128BitVector() || VT.is256BitVector())
01500         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01501 
01502       if (VT.getVectorElementType() == MVT::i1)
01503         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01504 
01505       // Do not attempt to custom lower other non-512-bit vectors
01506       if (!VT.is512BitVector())
01507         continue;
01508 
01509       if (EltSize >= 32) {
01510         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01511         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01512         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01513         setOperationAction(ISD::VSELECT,             VT, Legal);
01514         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01515         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01516         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01517       }
01518     }
01519     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01520       MVT VT = (MVT::SimpleValueType)i;
01521 
01522       // Do not attempt to promote non-512-bit vectors
01523       if (!VT.is512BitVector())
01524         continue;
01525 
01526       setOperationAction(ISD::SELECT, VT, Promote);
01527       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01528     }
01529   } // has AVX-512
01530 
01531   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
01532     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
01533     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
01534 
01535     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
01536     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
01537 
01538     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
01539     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
01540     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
01541     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
01542 
01543     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01544       const MVT VT = (MVT::SimpleValueType)i;
01545 
01546       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01547 
01548       // Do not attempt to handle non-512-bit vectors
01549       if (!VT.is512BitVector())
01550         continue;
01551 
01552       if (EltSize < 32) {
01553         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01554         setOperationAction(ISD::VSELECT,             VT, Legal);
01555       }
01556     }
01557   }
01558 
01559   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
01560     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
01561     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
01562 
01563     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
01564     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
01565     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
01566   }
01567 
01568   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
01569   // of this type with custom code.
01570   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
01571            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
01572     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
01573                        Custom);
01574   }
01575 
01576   // We want to custom lower some of our intrinsics.
01577   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01578   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01579   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01580   if (!Subtarget->is64Bit())
01581     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01582 
01583   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01584   // handle type legalization for these operations here.
01585   //
01586   // FIXME: We really should do custom legalization for addition and
01587   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01588   // than generic legalization for 64-bit multiplication-with-overflow, though.
01589   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01590     // Add/Sub/Mul with overflow operations are custom lowered.
01591     MVT VT = IntVTs[i];
01592     setOperationAction(ISD::SADDO, VT, Custom);
01593     setOperationAction(ISD::UADDO, VT, Custom);
01594     setOperationAction(ISD::SSUBO, VT, Custom);
01595     setOperationAction(ISD::USUBO, VT, Custom);
01596     setOperationAction(ISD::SMULO, VT, Custom);
01597     setOperationAction(ISD::UMULO, VT, Custom);
01598   }
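  // For illustration: the custom lowering typically maps these onto X86 nodes
  // that also produce EFLAGS; e.g. (SADDO a, b) becomes roughly an X86ISD::ADD
  // whose flags result feeds an X86ISD::SETCC on the overflow condition, so
  // the overflow bit is read directly from the flags of the add.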
01599 
01600   // There are no 8-bit 3-address imul/mul instructions
01601   setOperationAction(ISD::SMULO, MVT::i8, Expand);
01602   setOperationAction(ISD::UMULO, MVT::i8, Expand);
01603 
01604   if (!Subtarget->is64Bit()) {
01605     // These libcalls are not available in 32-bit.
01606     setLibcallName(RTLIB::SHL_I128, nullptr);
01607     setLibcallName(RTLIB::SRL_I128, nullptr);
01608     setLibcallName(RTLIB::SRA_I128, nullptr);
01609   }
01610 
01611   // Combine sin / cos into one node or libcall if possible.
01612   if (Subtarget->hasSinCos()) {
01613     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01614     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01615     if (Subtarget->isTargetDarwin()) {
01616       // For MacOSX, we don't want the normal expansion of a libcall to
01617       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
01618       // traffic.
01619       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01620       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01621     }
01622   }
01623 
01624   if (Subtarget->isTargetWin64()) {
01625     setOperationAction(ISD::SDIV, MVT::i128, Custom);
01626     setOperationAction(ISD::UDIV, MVT::i128, Custom);
01627     setOperationAction(ISD::SREM, MVT::i128, Custom);
01628     setOperationAction(ISD::UREM, MVT::i128, Custom);
01629     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
01630     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
01631   }
01632 
01633   // We have target-specific dag combine patterns for the following nodes:
01634   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01635   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01636   setTargetDAGCombine(ISD::VSELECT);
01637   setTargetDAGCombine(ISD::SELECT);
01638   setTargetDAGCombine(ISD::SHL);
01639   setTargetDAGCombine(ISD::SRA);
01640   setTargetDAGCombine(ISD::SRL);
01641   setTargetDAGCombine(ISD::OR);
01642   setTargetDAGCombine(ISD::AND);
01643   setTargetDAGCombine(ISD::ADD);
01644   setTargetDAGCombine(ISD::FADD);
01645   setTargetDAGCombine(ISD::FSUB);
01646   setTargetDAGCombine(ISD::FMA);
01647   setTargetDAGCombine(ISD::SUB);
01648   setTargetDAGCombine(ISD::LOAD);
01649   setTargetDAGCombine(ISD::STORE);
01650   setTargetDAGCombine(ISD::ZERO_EXTEND);
01651   setTargetDAGCombine(ISD::ANY_EXTEND);
01652   setTargetDAGCombine(ISD::SIGN_EXTEND);
01653   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01654   setTargetDAGCombine(ISD::TRUNCATE);
01655   setTargetDAGCombine(ISD::SINT_TO_FP);
01656   setTargetDAGCombine(ISD::SETCC);
01657   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
01658   setTargetDAGCombine(ISD::BUILD_VECTOR);
01659   if (Subtarget->is64Bit())
01660     setTargetDAGCombine(ISD::MUL);
01661   setTargetDAGCombine(ISD::XOR);
01662 
01663   computeRegisterProperties();
01664 
01665   // On Darwin, -Os means optimize for size without hurting performance, so do
01666   // not reduce the limit.
01667   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01668   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01669   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01670   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01671   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01672   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01673   setPrefLoopAlignment(4); // 2^4 bytes.
01674 
01675   // Predictable cmovs don't hurt on Atom because it's in-order.
01676   PredictableSelectIsExpensive = !Subtarget->isAtom();
01677 
01678   setPrefFunctionAlignment(4); // 2^4 bytes.
01679 
01680   verifyIntrinsicTables();
01681 }
01682 
01683 // This has so far only been implemented for 64-bit MachO.
01684 bool X86TargetLowering::useLoadStackGuardNode() const {
01685   return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO &&
01686          Subtarget->is64Bit();
01687 }
01688 
01689 TargetLoweringBase::LegalizeTypeAction
01690 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
01691   if (ExperimentalVectorWideningLegalization &&
01692       VT.getVectorNumElements() != 1 &&
01693       VT.getVectorElementType().getSimpleVT() != MVT::i1)
01694     return TypeWidenVector;
01695 
01696   return TargetLoweringBase::getPreferredVectorAction(VT);
01697 }
01698 
01699 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01700   if (!VT.isVector())
01701     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
01702 
01703   const unsigned NumElts = VT.getVectorNumElements();
01704   const EVT EltVT = VT.getVectorElementType();
01705   if (VT.is512BitVector()) {
01706     if (Subtarget->hasAVX512())
01707       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01708           EltVT == MVT::f32 || EltVT == MVT::f64)
01709         switch(NumElts) {
01710         case  8: return MVT::v8i1;
01711         case 16: return MVT::v16i1;
01712       }
01713     if (Subtarget->hasBWI())
01714       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01715         switch(NumElts) {
01716         case 32: return MVT::v32i1;
01717         case 64: return MVT::v64i1;
01718       }
01719   }
01720 
01721   if (VT.is256BitVector() || VT.is128BitVector()) {
01722     if (Subtarget->hasVLX())
01723       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01724           EltVT == MVT::f32 || EltVT == MVT::f64)
01725         switch(NumElts) {
01726         case 2: return MVT::v2i1;
01727         case 4: return MVT::v4i1;
01728         case 8: return MVT::v8i1;
01729       }
01730     if (Subtarget->hasBWI() && Subtarget->hasVLX())
01731       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01732         switch(NumElts) {
01733         case  8: return MVT::v8i1;
01734         case 16: return MVT::v16i1;
01735         case 32: return MVT::v32i1;
01736       }
01737   }
01738 
01739   return VT.changeVectorElementTypeToInteger();
01740 }
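// For example, with AVX-512 a compare of two v16f32 values produces a v16i1
// mask (held in a k-register), and scalar compares produce i1 rather than i8;
// without the corresponding mask support, the result falls back to
// changeVectorElementTypeToInteger(), e.g. v16i32 for a v16f32 compare.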
01741 
01742 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
01743 /// the desired ByVal argument alignment.
01744 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01745   if (MaxAlign == 16)
01746     return;
01747   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01748     if (VTy->getBitWidth() == 128)
01749       MaxAlign = 16;
01750   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01751     unsigned EltAlign = 0;
01752     getMaxByValAlign(ATy->getElementType(), EltAlign);
01753     if (EltAlign > MaxAlign)
01754       MaxAlign = EltAlign;
01755   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01756     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01757       unsigned EltAlign = 0;
01758       getMaxByValAlign(STy->getElementType(i), EltAlign);
01759       if (EltAlign > MaxAlign)
01760         MaxAlign = EltAlign;
01761       if (MaxAlign == 16)
01762         break;
01763     }
01764   }
01765 }
01766 
01767 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
01768 /// function arguments in the caller parameter area. For X86, aggregates
01769 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01770 /// are at 4-byte boundaries.
01771 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01772   if (Subtarget->is64Bit()) {
01773     // Max of 8 and alignment of type.
01774     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01775     if (TyAlign > 8)
01776       return TyAlign;
01777     return 8;
01778   }
01779 
01780   unsigned Align = 4;
01781   if (Subtarget->hasSSE1())
01782     getMaxByValAlign(Ty, Align);
01783   return Align;
01784 }
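// For example, on a 32-bit target with SSE a byval struct containing a
// <4 x float> member is placed at a 16-byte boundary, while a struct of plain
// ints stays at the default 4-byte boundary; on 64-bit targets the ABI
// alignment of the type is used, with a floor of 8 bytes.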
01785 
01786 /// getOptimalMemOpType - Returns the target specific optimal type for load
01787 /// and store operations as a result of memset, memcpy, and memmove
01788 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
01789 /// constraint. Similarly, if SrcAlign is zero, there is no need to check it
01790 /// against an alignment requirement, probably because the source does not
01791 /// need to be loaded. If 'IsMemset' is
01792 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
01793 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
01794 /// source is constant so it does not need to be loaded.
01795 /// It returns EVT::Other if the type should be determined using generic
01796 /// target-independent logic.
01797 EVT
01798 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01799                                        unsigned DstAlign, unsigned SrcAlign,
01800                                        bool IsMemset, bool ZeroMemset,
01801                                        bool MemcpyStrSrc,
01802                                        MachineFunction &MF) const {
01803   const Function *F = MF.getFunction();
01804   if ((!IsMemset || ZeroMemset) &&
01805       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
01806                                        Attribute::NoImplicitFloat)) {
01807     if (Size >= 16 &&
01808         (Subtarget->isUnalignedMemAccessFast() ||
01809          ((DstAlign == 0 || DstAlign >= 16) &&
01810           (SrcAlign == 0 || SrcAlign >= 16)))) {
01811       if (Size >= 32) {
01812         if (Subtarget->hasInt256())
01813           return MVT::v8i32;
01814         if (Subtarget->hasFp256())
01815           return MVT::v8f32;
01816       }
01817       if (Subtarget->hasSSE2())
01818         return MVT::v4i32;
01819       if (Subtarget->hasSSE1())
01820         return MVT::v4f32;
01821     } else if (!MemcpyStrSrc && Size >= 8 &&
01822                !Subtarget->is64Bit() &&
01823                Subtarget->hasSSE2()) {
01824       // Do not use f64 to lower memcpy if source is string constant. It's
01825       // better to use i32 to avoid the loads.
01826       return MVT::f64;
01827     }
01828   }
01829   if (Subtarget->is64Bit() && Size >= 8)
01830     return MVT::i64;
01831   return MVT::i32;
01832 }
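// For example, a 32-byte memcpy on an AVX2 (hasInt256) target with fast
// unaligned accesses is lowered with v8i32 stores; the same copy with only
// SSE2 falls back to v4i32, and copies too small for the vector path use i64
// on 64-bit targets or i32 otherwise.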
01833 
01834 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01835   if (VT == MVT::f32)
01836     return X86ScalarSSEf32;
01837   else if (VT == MVT::f64)
01838     return X86ScalarSSEf64;
01839   return true;
01840 }
01841 
01842 bool
01843 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
01844                                                   unsigned,
01845                                                   unsigned,
01846                                                   bool *Fast) const {
01847   if (Fast)
01848     *Fast = Subtarget->isUnalignedMemAccessFast();
01849   return true;
01850 }
01851 
01852 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
01853 /// current function.  The returned value is a member of the
01854 /// MachineJumpTableInfo::JTEntryKind enum.
01855 unsigned X86TargetLowering::getJumpTableEncoding() const {
01856   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01857   // symbol.
01858   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01859       Subtarget->isPICStyleGOT())
01860     return MachineJumpTableInfo::EK_Custom32;
01861 
01862   // Otherwise, use the normal jump table encoding heuristics.
01863   return TargetLowering::getJumpTableEncoding();
01864 }
01865 
01866 const MCExpr *
01867 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01868                                              const MachineBasicBlock *MBB,
01869                                              unsigned uid,MCContext &Ctx) const{
01870   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
01871          Subtarget->isPICStyleGOT());
01872   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01873   // entries.
01874   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01875                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01876 }
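// Illustration: in GOT PIC mode each jump-table entry ends up as something
// like ".long .LBB0_7@GOTOFF" (label name hypothetical), i.e. the basic block
// symbol with a VK_GOTOFF modifier, resolved relative to the PIC base.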
01877 
01878 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
01879 /// jumptable.
01880 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01881                                                     SelectionDAG &DAG) const {
01882   if (!Subtarget->is64Bit())
01883     // This doesn't have an SDLoc associated with it, but it is not really the
01884     // same as a Register.
01885     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01886   return Table;
01887 }
01888 
01889 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
01890 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
01891 /// MCExpr.
01892 const MCExpr *X86TargetLowering::
01893 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01894                              MCContext &Ctx) const {
01895   // X86-64 uses RIP relative addressing based on the jump table label.
01896   if (Subtarget->isPICStyleRIPRel())
01897     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01898 
01899   // Otherwise, the reference is relative to the PIC base.
01900   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01901 }
01902 
01903 // FIXME: Why is this routine here? Move to RegInfo!
01904 std::pair<const TargetRegisterClass*, uint8_t>
01905 X86TargetLowering::findRepresentativeClass(MVT VT) const{
01906   const TargetRegisterClass *RRC = nullptr;
01907   uint8_t Cost = 1;
01908   switch (VT.SimpleTy) {
01909   default:
01910     return TargetLowering::findRepresentativeClass(VT);
01911   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01912     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
01913     break;
01914   case MVT::x86mmx:
01915     RRC = &X86::VR64RegClass;
01916     break;
01917   case MVT::f32: case MVT::f64:
01918   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01919   case MVT::v4f32: case MVT::v2f64:
01920   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01921   case MVT::v4f64:
01922     RRC = &X86::VR128RegClass;
01923     break;
01924   }
01925   return std::make_pair(RRC, Cost);
01926 }
01927 
01928 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01929                                                unsigned &Offset) const {
01930   if (!Subtarget->isTargetLinux())
01931     return false;
01932 
01933   if (Subtarget->is64Bit()) {
01934     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
01935     Offset = 0x28;
01936     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01937       AddressSpace = 256;
01938     else
01939       AddressSpace = 257;
01940   } else {
01941     // %gs:0x14 on i386
01942     Offset = 0x14;
01943     AddressSpace = 256;
01944   }
01945   return true;
01946 }
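// Illustration: on 64-bit Linux the stack cookie load produced from this is
// effectively "movq %fs:0x28, %reg" (address space 257 maps to %fs), while on
// i386 it is "movl %gs:0x14, %reg" (address space 256 maps to %gs).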
01947 
01948 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01949                                             unsigned DestAS) const {
01950   assert(SrcAS != DestAS && "Expected different address spaces!");
01951 
01952   return SrcAS < 256 && DestAS < 256;
01953 }
01954 
01955 //===----------------------------------------------------------------------===//
01956 //               Return Value Calling Convention Implementation
01957 //===----------------------------------------------------------------------===//
01958 
01959 #include "X86GenCallingConv.inc"
01960 
01961 bool
01962 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01963                                   MachineFunction &MF, bool isVarArg,
01964                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01965                         LLVMContext &Context) const {
01966   SmallVector<CCValAssign, 16> RVLocs;
01967   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
01968   return CCInfo.CheckReturn(Outs, RetCC_X86);
01969 }
01970 
01971 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
01972   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
01973   return ScratchRegs;
01974 }
01975 
01976 SDValue
01977 X86TargetLowering::LowerReturn(SDValue Chain,
01978                                CallingConv::ID CallConv, bool isVarArg,
01979                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01980                                const SmallVectorImpl<SDValue> &OutVals,
01981                                SDLoc dl, SelectionDAG &DAG) const {
01982   MachineFunction &MF = DAG.getMachineFunction();
01983   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01984 
01985   SmallVector<CCValAssign, 16> RVLocs;
01986   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
01987   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01988 
01989   SDValue Flag;
01990   SmallVector<SDValue, 6> RetOps;
01991   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
01992   // Operand #1 = Bytes To Pop
01993   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
01994                    MVT::i16));
01995 
01996   // Copy the result values into the output registers.
01997   for (unsigned i = 0; i != RVLocs.size(); ++i) {
01998     CCValAssign &VA = RVLocs[i];
01999     assert(VA.isRegLoc() && "Can only return in registers!");
02000     SDValue ValToCopy = OutVals[i];
02001     EVT ValVT = ValToCopy.getValueType();
02002 
02003     // Promote values to the appropriate types
02004     if (VA.getLocInfo() == CCValAssign::SExt)
02005       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
02006     else if (VA.getLocInfo() == CCValAssign::ZExt)
02007       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
02008     else if (VA.getLocInfo() == CCValAssign::AExt)
02009       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
02010     else if (VA.getLocInfo() == CCValAssign::BCvt)
02011       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
02012 
02013     assert(VA.getLocInfo() != CCValAssign::FPExt &&
02014            "Unexpected FP-extend for return value.");  
02015 
02016     // If this is x86-64, and we disabled SSE, we can't return FP values,
02017     // or SSE or MMX vectors.
02018     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
02019          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
02020           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
02021       report_fatal_error("SSE register return with SSE disabled");
02022     }
02023     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
02024     // llvm-gcc has never done it right and no one has noticed, so this
02025     // should be OK for now.
02026     if (ValVT == MVT::f64 &&
02027         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
02028       report_fatal_error("SSE2 register return with SSE2 disabled");
02029 
02030     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
02031     // the RET instruction and handled by the FP Stackifier.
02032     if (VA.getLocReg() == X86::FP0 ||
02033         VA.getLocReg() == X86::FP1) {
02034       // If this is a copy from an xmm register to ST(0), use an FPExtend to
02035       // change the value to the FP stack register class.
02036       if (isScalarFPTypeInSSEReg(VA.getValVT()))
02037         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
02038       RetOps.push_back(ValToCopy);
02039       // Don't emit a copytoreg.
02040       continue;
02041     }
02042 
02043     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
02044     // which is returned in RAX / RDX.
02045     if (Subtarget->is64Bit()) {
02046       if (ValVT == MVT::x86mmx) {
02047         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
02048           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
02049           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
02050                                   ValToCopy);
02051           // If we don't have SSE2 available, convert to v4f32 so the generated
02052           // register is legal.
02053           if (!Subtarget->hasSSE2())
02054             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
02055         }
02056       }
02057     }
02058 
02059     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
02060     Flag = Chain.getValue(1);
02061     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
02062   }
02063 
02064   // The x86-64 ABIs require that for returning structs by value we copy
02065   // the sret argument into %rax/%eax (depending on ABI) for the return.
02066   // Win32 requires us to put the sret argument to %eax as well.
02067   // We saved the argument into a virtual register in the entry block,
02068   // so now we copy the value out and into %rax/%eax.
02069   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
02070       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
02071     MachineFunction &MF = DAG.getMachineFunction();
02072     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02073     unsigned Reg = FuncInfo->getSRetReturnReg();
02074     assert(Reg &&
02075            "SRetReturnReg should have been set in LowerFormalArguments().");
02076     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
02077 
02078     unsigned RetValReg
02079         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
02080           X86::RAX : X86::EAX;
02081     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
02082     Flag = Chain.getValue(1);
02083 
02084     // RAX/EAX now acts like a return value.
02085     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
02086   }
02087 
02088   RetOps[0] = Chain;  // Update chain.
02089 
02090   // Add the flag if we have it.
02091   if (Flag.getNode())
02092     RetOps.push_back(Flag);
02093 
02094   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
02095 }
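// Illustration: for a simple "ret i32 %x" this builds roughly
//   CopyToReg EAX, %x, glue
//   X86ISD::RET_FLAG chain, TargetConstant<bytes-to-pop>, Register:EAX, glue
// i.e. the return registers are glued to the RET node so the copies stay
// adjacent to the return.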
02096 
02097 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
02098   if (N->getNumValues() != 1)
02099     return false;
02100   if (!N->hasNUsesOfValue(1, 0))
02101     return false;
02102 
02103   SDValue TCChain = Chain;
02104   SDNode *Copy = *N->use_begin();
02105   if (Copy->getOpcode() == ISD::CopyToReg) {
02106     // If the copy has a glue operand, we conservatively assume it isn't safe to
02107     // perform a tail call.
02108     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
02109       return false;
02110     TCChain = Copy->getOperand(0);
02111   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
02112     return false;
02113 
02114   bool HasRet = false;
02115   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
02116        UI != UE; ++UI) {
02117     if (UI->getOpcode() != X86ISD::RET_FLAG)
02118       return false;
02119     // If we are returning more than one value, we can definitely
02120     // not make a tail call; see PR19530.
02121     if (UI->getNumOperands() > 4)
02122       return false;
02123     if (UI->getNumOperands() == 4 &&
02124         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
02125       return false;
02126     HasRet = true;
02127   }
02128 
02129   if (!HasRet)
02130     return false;
02131 
02132   Chain = TCChain;
02133   return true;
02134 }
02135 
02136 EVT
02137 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
02138                                             ISD::NodeType ExtendKind) const {
02139   MVT ReturnMVT;
02140   // TODO: Is this also valid on 32-bit?
02141   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
02142     ReturnMVT = MVT::i8;
02143   else
02144     ReturnMVT = MVT::i32;
02145 
02146   EVT MinVT = getRegisterType(Context, ReturnMVT);
02147   return VT.bitsLT(MinVT) ? MinVT : VT;
02148 }
02149 
02150 /// LowerCallResult - Lower the result values of a call into the
02151 /// appropriate copies out of appropriate physical registers.
02152 ///
02153 SDValue
02154 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
02155                                    CallingConv::ID CallConv, bool isVarArg,
02156                                    const SmallVectorImpl<ISD::InputArg> &Ins,
02157                                    SDLoc dl, SelectionDAG &DAG,
02158                                    SmallVectorImpl<SDValue> &InVals) const {
02159 
02160   // Assign locations to each value returned by this call.
02161   SmallVector<CCValAssign, 16> RVLocs;
02162   bool Is64Bit = Subtarget->is64Bit();
02163   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
02164                  *DAG.getContext());
02165   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02166 
02167   // Copy all of the result registers out of their specified physreg.
02168   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02169     CCValAssign &VA = RVLocs[i];
02170     EVT CopyVT = VA.getValVT();
02171 
02172     // If this is x86-64, and we disabled SSE, we can't return FP values
02173     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02174         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02175       report_fatal_error("SSE register return with SSE disabled");
02176     }
02177 
02178     // If we prefer to use the value in xmm registers, copy it out as f80 and
02179     // use a truncate to move it from fp stack reg to xmm reg.
02180     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
02181         isScalarFPTypeInSSEReg(VA.getValVT()))
02182       CopyVT = MVT::f80;
02183 
02184     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02185                                CopyVT, InFlag).getValue(1);
02186     SDValue Val = Chain.getValue(0);
02187 
02188     if (CopyVT != VA.getValVT())
02189       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02190                         // This truncation won't change the value.
02191                         DAG.getIntPtrConstant(1));
02192 
02193     InFlag = Chain.getValue(2);
02194     InVals.push_back(Val);
02195   }
02196 
02197   return Chain;
02198 }
02199 
02200 //===----------------------------------------------------------------------===//
02201 //                C & StdCall & Fast Calling Convention implementation
02202 //===----------------------------------------------------------------------===//
02203 //  The StdCall calling convention is standard for many Windows API routines.
02204 //  It differs from the C calling convention only a little: the callee cleans
02205 //  up the stack rather than the caller, and symbols are decorated in a
02206 //  target-specific way. It doesn't support any vector arguments.
02207 //  For info on the fast calling convention see the Fast Calling Convention
02208 //  (tail call) implementation, LowerX86_32FastCCCallTo.
02209 
02210 /// CallIsStructReturn - Determines whether a call uses struct return
02211 /// semantics.
02212 enum StructReturnType {
02213   NotStructReturn,
02214   RegStructReturn,
02215   StackStructReturn
02216 };
02217 static StructReturnType
02218 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02219   if (Outs.empty())
02220     return NotStructReturn;
02221 
02222   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02223   if (!Flags.isSRet())
02224     return NotStructReturn;
02225   if (Flags.isInReg())
02226     return RegStructReturn;
02227   return StackStructReturn;
02228 }
02229 
02230 /// ArgsAreStructReturn - Determines whether a function uses struct
02231 /// return semantics.
02232 static StructReturnType
02233 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02234   if (Ins.empty())
02235     return NotStructReturn;
02236 
02237   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02238   if (!Flags.isSRet())
02239     return NotStructReturn;
02240   if (Flags.isInReg())
02241     return RegStructReturn;
02242   return StackStructReturn;
02243 }
02244 
02245 /// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
02246 /// specified by "Src" to address "Dst" with size and alignment information
02247 /// specified by the specific parameter attribute. The copy will be passed as
02248 /// a byval function parameter.
02249 static SDValue
02250 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02251                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02252                           SDLoc dl) {
02253   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02254 
02255   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02256                        /*isVolatile*/false, /*AlwaysInline=*/true,
02257                        MachinePointerInfo(), MachinePointerInfo());
02258 }
02259 
02260 /// IsTailCallConvention - Return true if the calling convention is one that
02261 /// supports tail call optimization.
02262 static bool IsTailCallConvention(CallingConv::ID CC) {
02263   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02264           CC == CallingConv::HiPE);
02265 }
02266 
02267 /// \brief Return true if the calling convention is a C calling convention.
02268 static bool IsCCallConvention(CallingConv::ID CC) {
02269   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02270           CC == CallingConv::X86_64_SysV);
02271 }
02272 
02273 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02274   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02275     return false;
02276 
02277   CallSite CS(CI);
02278   CallingConv::ID CalleeCC = CS.getCallingConv();
02279   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02280     return false;
02281 
02282   return true;
02283 }
02284 
02285 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
02286 /// a tailcall target by changing its ABI.
02287 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02288                                    bool GuaranteedTailCallOpt) {
02289   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02290 }
02291 
02292 SDValue
02293 X86TargetLowering::LowerMemArgument(SDValue Chain,
02294                                     CallingConv::ID CallConv,
02295                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02296                                     SDLoc dl, SelectionDAG &DAG,
02297                                     const CCValAssign &VA,
02298                                     MachineFrameInfo *MFI,
02299                                     unsigned i) const {
02300   // Create the nodes corresponding to a load from this parameter slot.
02301   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02302   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
02303       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
02304   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02305   EVT ValVT;
02306 
02307   // If the value is passed by pointer, we have the address passed instead of
02308   // the value itself.
02309   if (VA.getLocInfo() == CCValAssign::Indirect)
02310     ValVT = VA.getLocVT();
02311   else
02312     ValVT = VA.getValVT();
02313 
02314   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02315   // changed with more analysis.
02316   // In case of tail call optimization, mark all arguments mutable, since they
02317   // could be overwritten by the lowering of the arguments of a tail call.
02318   if (Flags.isByVal()) {
02319     unsigned Bytes = Flags.getByValSize();
02320     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02321     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02322     return DAG.getFrameIndex(FI, getPointerTy());
02323   } else {
02324     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02325                                     VA.getLocMemOffset(), isImmutable);
02326     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02327     return DAG.getLoad(ValVT, dl, Chain, FIN,
02328                        MachinePointerInfo::getFixedStack(FI),
02329                        false, false, false, 0);
02330   }
02331 }
02332 
02333 // FIXME: Get this from tablegen.
02334 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
02335                                                 const X86Subtarget *Subtarget) {
02336   assert(Subtarget->is64Bit());
02337 
02338   if (Subtarget->isCallingConvWin64(CallConv)) {
02339     static const MCPhysReg GPR64ArgRegsWin64[] = {
02340       X86::RCX, X86::RDX, X86::R8,  X86::R9
02341     };
02342     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
02343   }
02344 
02345   static const MCPhysReg GPR64ArgRegs64Bit[] = {
02346     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02347   };
02348   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
02349 }
02350 
02351 // FIXME: Get this from tablegen.
02352 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
02353                                                 CallingConv::ID CallConv,
02354                                                 const X86Subtarget *Subtarget) {
02355   assert(Subtarget->is64Bit());
02356   if (Subtarget->isCallingConvWin64(CallConv)) {
02357     // The XMM registers which might contain vararg parameters are shadowed
02358     // by their paired GPRs, so we only need to save the GPRs to their home
02359     // slots.
02360     // TODO: __vectorcall will change this.
02361     return None;
02362   }
02363 
02364   const Function *Fn = MF.getFunction();
02365   bool NoImplicitFloatOps = Fn->getAttributes().
02366       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
02367   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
02368          "SSE register cannot be used when SSE is disabled!");
02369   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02370       !Subtarget->hasSSE1())
02371     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
02372     // registers.
02373     return None;
02374 
02375   static const MCPhysReg XMMArgRegs64Bit[] = {
02376     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02377     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02378   };
02379   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
02380 }
02381 
02382 SDValue
02383 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02384                                         CallingConv::ID CallConv,
02385                                         bool isVarArg,
02386                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02387                                         SDLoc dl,
02388                                         SelectionDAG &DAG,
02389                                         SmallVectorImpl<SDValue> &InVals)
02390                                           const {
02391   MachineFunction &MF = DAG.getMachineFunction();
02392   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02393 
02394   const Function* Fn = MF.getFunction();
02395   if (Fn->hasExternalLinkage() &&
02396       Subtarget->isTargetCygMing() &&
02397       Fn->getName() == "main")
02398     FuncInfo->setForceFramePointer(true);
02399 
02400   MachineFrameInfo *MFI = MF.getFrameInfo();
02401   bool Is64Bit = Subtarget->is64Bit();
02402   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02403 
02404   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02405          "Var args not supported with calling convention fastcc, ghc or hipe");
02406 
02407   // Assign locations to all of the incoming arguments.
02408   SmallVector<CCValAssign, 16> ArgLocs;
02409   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02410 
02411   // Allocate shadow area for Win64
02412   if (IsWin64)
02413     CCInfo.AllocateStack(32, 8);
02414 
02415   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02416 
02417   unsigned LastVal = ~0U;
02418   SDValue ArgValue;
02419   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02420     CCValAssign &VA = ArgLocs[i];
02421     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02422     // places.
02423     assert(VA.getValNo() != LastVal &&
02424            "Don't support value assigned to multiple locs yet");
02425     (void)LastVal;
02426     LastVal = VA.getValNo();
02427 
02428     if (VA.isRegLoc()) {
02429       EVT RegVT = VA.getLocVT();
02430       const TargetRegisterClass *RC;
02431       if (RegVT == MVT::i32)
02432         RC = &X86::GR32RegClass;
02433       else if (Is64Bit && RegVT == MVT::i64)
02434         RC = &X86::GR64RegClass;
02435       else if (RegVT == MVT::f32)
02436         RC = &X86::FR32RegClass;
02437       else if (RegVT == MVT::f64)
02438         RC = &X86::FR64RegClass;
02439       else if (RegVT.is512BitVector())
02440         RC = &X86::VR512RegClass;
02441       else if (RegVT.is256BitVector())
02442         RC = &X86::VR256RegClass;
02443       else if (RegVT.is128BitVector())
02444         RC = &X86::VR128RegClass;
02445       else if (RegVT == MVT::x86mmx)
02446         RC = &X86::VR64RegClass;
02447       else if (RegVT == MVT::i1)
02448         RC = &X86::VK1RegClass;
02449       else if (RegVT == MVT::v8i1)
02450         RC = &X86::VK8RegClass;
02451       else if (RegVT == MVT::v16i1)
02452         RC = &X86::VK16RegClass;
02453       else if (RegVT == MVT::v32i1)
02454         RC = &X86::VK32RegClass;
02455       else if (RegVT == MVT::v64i1)
02456         RC = &X86::VK64RegClass;
02457       else
02458         llvm_unreachable("Unknown argument type!");
02459 
02460       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02461       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02462 
02463       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02464       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02465       // right size.
02466       if (VA.getLocInfo() == CCValAssign::SExt)
02467         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02468                                DAG.getValueType(VA.getValVT()));
02469       else if (VA.getLocInfo() == CCValAssign::ZExt)
02470         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02471                                DAG.getValueType(VA.getValVT()));
02472       else if (VA.getLocInfo() == CCValAssign::BCvt)
02473         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02474 
02475       if (VA.isExtInLoc()) {
02476         // Handle MMX values passed in XMM regs.
02477         if (RegVT.isVector())
02478           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02479         else
02480           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02481       }
02482     } else {
02483       assert(VA.isMemLoc());
02484       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02485     }
02486 
02487     // If value is passed via pointer - do a load.
02488     if (VA.getLocInfo() == CCValAssign::Indirect)
02489       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02490                              MachinePointerInfo(), false, false, false, 0);
02491 
02492     InVals.push_back(ArgValue);
02493   }
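  // Added illustrative note: for example, an i8 argument marked 'zeroext' on
  // x86-64 arrives promoted in a 32-bit register, so the loop above copies it
  // out as an i32, wraps it in AssertZext(i8), and truncates it back to i8
  // before pushing the value into InVals.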
02494 
02495   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
02496     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02497       // The x86-64 ABIs require that for returning structs by value we copy
02498       // the sret argument into %rax/%eax (depending on ABI) for the return.
02499       // Win32 requires us to put the sret argument in %eax as well.
02500       // Save the argument into a virtual register so that we can access it
02501       // from the return points.
02502       if (Ins[i].Flags.isSRet()) {
02503         unsigned Reg = FuncInfo->getSRetReturnReg();
02504         if (!Reg) {
02505           MVT PtrTy = getPointerTy();
02506           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02507           FuncInfo->setSRetReturnReg(Reg);
02508         }
02509         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
02510         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02511         break;
02512       }
02513     }
02514   }
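  // Added illustrative note: for a function such as
  // 'define void @make(%struct.S* sret %out)' (hypothetical IR), the loop
  // above stashes the incoming sret pointer in a virtual register so the
  // return lowering can later place it back in %rax/%eax as described above.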
02515 
02516   unsigned StackSize = CCInfo.getNextStackOffset();
02517   // Align stack specially for tail calls.
02518   if (FuncIsMadeTailCallSafe(CallConv,
02519                              MF.getTarget().Options.GuaranteedTailCallOpt))
02520     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02521 
02522   // If the function takes variable number of arguments, make a frame index for
02523   // the start of the first vararg value... for expansion of llvm.va_start. We
02524   // can skip this if there are no va_start calls.
02525   if (MFI->hasVAStart() &&
02526       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02527                    CallConv != CallingConv::X86_ThisCall))) {
02528     FuncInfo->setVarArgsFrameIndex(
02529         MFI->CreateFixedObject(1, StackSize, true));
02530   }
02531 
02532   // 64-bit calling conventions support varargs and register parameters, so we
02533   // have to do extra work to spill them in the prologue or forward them to
02534   // musttail calls.
02535   if (Is64Bit && isVarArg &&
02536       (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
02537     // Find the first unallocated argument registers.
02538     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
02539     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
02540     unsigned NumIntRegs =
02541         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
02542     unsigned NumXMMRegs =
02543         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
02544     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02545            "SSE register cannot be used when SSE is disabled!");
02546 
02547     // Gather all the live in physical registers.
02548     SmallVector<SDValue, 6> LiveGPRs;
02549     SmallVector<SDValue, 8> LiveXMMRegs;
02550     SDValue ALVal;
02551     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
02552       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
02553       LiveGPRs.push_back(
02554           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
02555     }
02556     if (!ArgXMMs.empty()) {
02557       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02558       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
02559       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
02560         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
02561         LiveXMMRegs.push_back(
02562             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
02563       }
02564     }
02565 
02566     // Store them to the va_list returned by va_start.
02567     if (MFI->hasVAStart()) {
02568       if (IsWin64) {
02569         const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
02570         // Get to the caller-allocated home save location.  Add 8 to account
02571         // for the return address.
02572         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02573         FuncInfo->setRegSaveFrameIndex(
02574           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02575         // Fixup to set vararg frame on shadow area (4 x i64).
02576         if (NumIntRegs < 4)
02577           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02578       } else {
02579         // For X86-64, if there are vararg parameters that are passed via
02580         // registers, then we must store them to their spots on the stack so
02581         // they may be loaded by dereferencing the result of va_arg.
02582         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02583         FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
02584         FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
02585             ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
02586       }
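      // Added worked example (commentary only): for a SysV x86-64 vararg
      // function that has already consumed 2 of the 6 integer argument
      // registers and 1 of the 8 XMM registers, the code above records
      // VarArgsGPOffset = 2*8 = 16 and VarArgsFPOffset = 6*8 + 1*16 = 64,
      // and allocates a 6*8 + 8*16 = 176-byte register save area -- the
      // reg_save_area layout that va_arg expects.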
02587 
02588       // Store the integer parameter registers.
02589       SmallVector<SDValue, 8> MemOps;
02590       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02591                                         getPointerTy());
02592       unsigned Offset = FuncInfo->getVarArgsGPOffset();
02593       for (SDValue Val : LiveGPRs) {
02594         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02595                                   DAG.getIntPtrConstant(Offset));
02596         SDValue Store =
02597           DAG.getStore(Val.getValue(1), dl, Val, FIN,
02598                        MachinePointerInfo::getFixedStack(
02599                          FuncInfo->getRegSaveFrameIndex(), Offset),
02600                        false, false, 0);
02601         MemOps.push_back(Store);
02602         Offset += 8;
02603       }
02604 
02605       if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
02606         // Now store the XMM (fp + vector) parameter registers.
02607         SmallVector<SDValue, 12> SaveXMMOps;
02608         SaveXMMOps.push_back(Chain);
02609         SaveXMMOps.push_back(ALVal);
02610         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02611                                FuncInfo->getRegSaveFrameIndex()));
02612         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02613                                FuncInfo->getVarArgsFPOffset()));
02614         SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
02615                           LiveXMMRegs.end());
02616         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02617                                      MVT::Other, SaveXMMOps));
02618       }
02619 
02620       if (!MemOps.empty())
02621         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
02622     } else {
02623       // Add all GPRs, AL, and XMMs to the list of forwards.  We will add them
02624       // to the liveout set on a musttail call.
02625       assert(MFI->hasMustTailInVarArgFunc());
02626       auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
02627       typedef X86MachineFunctionInfo::Forward Forward;
02628 
02629       for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
02630         unsigned VReg =
02631             MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
02632         Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
02633         Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
02634       }
02635 
02636       if (!ArgXMMs.empty()) {
02637         unsigned ALVReg =
02638             MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
02639         Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
02640         Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
02641 
02642         for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
02643           unsigned VReg =
02644               MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
02645           Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
02646           Forwards.push_back(
02647               Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
02648         }
02649       }
02650     }
02651   }
02652 
02653   // Some CCs need callee pop.
02654   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02655                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02656     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02657   } else {
02658     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02659     // If this is an sret function, the return should pop the hidden pointer.
02660     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02661         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02662         argsAreStructReturn(Ins) == StackStructReturn)
02663       FuncInfo->setBytesToPopOnReturn(4);
02664   }
02665 
02666   if (!Is64Bit) {
02667     // RegSaveFrameIndex is X86-64 only.
02668     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02669     if (CallConv == CallingConv::X86_FastCall ||
02670         CallConv == CallingConv::X86_ThisCall)
02671       // fastcc functions can't have varargs.
02672       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02673   }
02674 
02675   FuncInfo->setArgumentStackSize(StackSize);
02676 
02677   return Chain;
02678 }
02679 
02680 SDValue
02681 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02682                                     SDValue StackPtr, SDValue Arg,
02683                                     SDLoc dl, SelectionDAG &DAG,
02684                                     const CCValAssign &VA,
02685                                     ISD::ArgFlagsTy Flags) const {
02686   unsigned LocMemOffset = VA.getLocMemOffset();
02687   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02688   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02689   if (Flags.isByVal())
02690     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02691 
02692   return DAG.getStore(Chain, dl, Arg, PtrOff,
02693                       MachinePointerInfo::getStack(LocMemOffset),
02694                       false, false, 0);
02695 }
02696 
02697 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
02698 /// optimization is performed and it is required.
02699 SDValue
02700 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02701                                            SDValue &OutRetAddr, SDValue Chain,
02702                                            bool IsTailCall, bool Is64Bit,
02703                                            int FPDiff, SDLoc dl) const {
02704   // Adjust the Return address stack slot.
02705   EVT VT = getPointerTy();
02706   OutRetAddr = getReturnAddressFrameIndex(DAG);
02707 
02708   // Load the "old" Return address.
02709   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02710                            false, false, false, 0);
02711   return SDValue(OutRetAddr.getNode(), 1);
02712 }
02713 
02714 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
02715 /// optimization is performed and it is required (FPDiff!=0).
02716 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
02717                                         SDValue Chain, SDValue RetAddrFrIdx,
02718                                         EVT PtrVT, unsigned SlotSize,
02719                                         int FPDiff, SDLoc dl) {
02720   // Store the return address to the appropriate stack slot.
02721   if (!FPDiff) return Chain;
02722   // Calculate the new stack slot for the return address.
02723   int NewReturnAddrFI =
02724     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02725                                          false);
02726   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02727   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02728                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02729                        false, false, 0);
02730   return Chain;
02731 }
02732 
02733 SDValue
02734 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02735                              SmallVectorImpl<SDValue> &InVals) const {
02736   SelectionDAG &DAG                     = CLI.DAG;
02737   SDLoc &dl                             = CLI.DL;
02738   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02739   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02740   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02741   SDValue Chain                         = CLI.Chain;
02742   SDValue Callee                        = CLI.Callee;
02743   CallingConv::ID CallConv              = CLI.CallConv;
02744   bool &isTailCall                      = CLI.IsTailCall;
02745   bool isVarArg                         = CLI.IsVarArg;
02746 
02747   MachineFunction &MF = DAG.getMachineFunction();
02748   bool Is64Bit        = Subtarget->is64Bit();
02749   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02750   StructReturnType SR = callIsStructReturn(Outs);
02751   bool IsSibcall      = false;
02752   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02753 
02754   if (MF.getTarget().Options.DisableTailCalls)
02755     isTailCall = false;
02756 
02757   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
02758   if (IsMustTail) {
02759     // Force this to be a tail call.  The verifier rules are enough to ensure
02760     // that we can lower this successfully without moving the return address
02761     // around.
02762     isTailCall = true;
02763   } else if (isTailCall) {
02764     // Check if it's really possible to do a tail call.
02765     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02766                     isVarArg, SR != NotStructReturn,
02767                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02768                     Outs, OutVals, Ins, DAG);
02769 
02770     // Sibcalls are automatically detected tailcalls which do not require
02771     // ABI changes.
02772     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02773       IsSibcall = true;
02774 
02775     if (isTailCall)
02776       ++NumTailCalls;
02777   }
02778 
02779   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02780          "Var args not supported with calling convention fastcc, ghc or hipe");
02781 
02782   // Analyze operands of the call, assigning locations to each operand.
02783   SmallVector<CCValAssign, 16> ArgLocs;
02784   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02785 
02786   // Allocate shadow area for Win64
02787   if (IsWin64)
02788     CCInfo.AllocateStack(32, 8);
02789 
02790   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02791 
02792   // Get a count of how many bytes are to be pushed on the stack.
02793   unsigned NumBytes = CCInfo.getNextStackOffset();
02794   if (IsSibcall)
02795     // This is a sibcall. The memory operands are already available in the
02796     // caller's incoming argument space, which lives in its own caller's stack.
02797     NumBytes = 0;
02798   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
02799            IsTailCallConvention(CallConv))
02800     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02801 
02802   int FPDiff = 0;
02803   if (isTailCall && !IsSibcall && !IsMustTail) {
02804     // Lower arguments at fp - stackoffset + fpdiff.
02805     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02806 
02807     FPDiff = NumBytesCallerPushed - NumBytes;
02808 
02809     // Set the delta of movement of the return-address stack slot, but only
02810     // if this call needs a larger move (a more negative delta) than recorded.
02811     if (FPDiff < X86Info->getTCReturnAddrDelta())
02812       X86Info->setTCReturnAddrDelta(FPDiff);
02813   }
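  // Added worked example (commentary only): under -tailcallopt, if the
  // caller's own incoming arguments occupy 8 bytes but the callee needs 24
  // bytes of argument space, then FPDiff = 8 - 24 = -16, so the return
  // address must be re-stored 16 bytes lower; the load/store helpers used
  // below take care of that move.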
02814 
02815   unsigned NumBytesToPush = NumBytes;
02816   unsigned NumBytesToPop = NumBytes;
02817 
02818   // If we have an inalloca argument, all stack space has already been allocated
02819   // for us and is right at the top of the stack.  We don't support multiple
02820   // arguments passed in memory when using inalloca.
02821   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02822     NumBytesToPush = 0;
02823     if (!ArgLocs.back().isMemLoc())
02824       report_fatal_error("cannot use inalloca attribute on a register "
02825                          "parameter");
02826     if (ArgLocs.back().getLocMemOffset() != 0)
02827       report_fatal_error("any parameter with the inalloca attribute must be "
02828                          "the only memory argument");
02829   }
02830 
02831   if (!IsSibcall)
02832     Chain = DAG.getCALLSEQ_START(
02833         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02834 
02835   SDValue RetAddrFrIdx;
02836   // Load return address for tail calls.
02837   if (isTailCall && FPDiff)
02838     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02839                                     Is64Bit, FPDiff, dl);
02840 
02841   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02842   SmallVector<SDValue, 8> MemOpChains;
02843   SDValue StackPtr;
02844 
02845   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02846   // of tail call optimization, arguments are handled later.
02847   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
02848       DAG.getSubtarget().getRegisterInfo());
02849   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02850     // Skip inalloca arguments, they have already been written.
02851     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02852     if (Flags.isInAlloca())
02853       continue;
02854 
02855     CCValAssign &VA = ArgLocs[i];
02856     EVT RegVT = VA.getLocVT();
02857     SDValue Arg = OutVals[i];
02858     bool isByVal = Flags.isByVal();
02859 
02860     // Promote the value if needed.
02861     switch (VA.getLocInfo()) {
02862     default: llvm_unreachable("Unknown loc info!");
02863     case CCValAssign::Full: break;
02864     case CCValAssign::SExt:
02865       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02866       break;
02867     case CCValAssign::ZExt:
02868       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02869       break;
02870     case CCValAssign::AExt:
02871       if (RegVT.is128BitVector()) {
02872         // Special case: passing MMX values in XMM registers.
02873         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02874         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02875         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02876       } else
02877         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02878       break;
02879     case CCValAssign::BCvt:
02880       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02881       break;
02882     case CCValAssign::Indirect: {
02883       // Store the argument.
02884       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02885       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02886       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02887                            MachinePointerInfo::getFixedStack(FI),
02888                            false, false, 0);
02889       Arg = SpillSlot;
02890       break;
02891     }
02892     }
02893 
02894     if (VA.isRegLoc()) {
02895       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02896       if (isVarArg && IsWin64) {
02897         // The Win64 ABI requires an argument passed in an XMM register to be
02898         // copied to its shadow GPR if the callee is a varargs function.
02899         unsigned ShadowReg = 0;
02900         switch (VA.getLocReg()) {
02901         case X86::XMM0: ShadowReg = X86::RCX; break;
02902         case X86::XMM1: ShadowReg = X86::RDX; break;
02903         case X86::XMM2: ShadowReg = X86::R8; break;
02904         case X86::XMM3: ShadowReg = X86::R9; break;
02905         }
02906         if (ShadowReg)
02907           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02908       }
02909     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02910       assert(VA.isMemLoc());
02911       if (!StackPtr.getNode())
02912         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02913                                       getPointerTy());
02914       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02915                                              dl, DAG, VA, Flags));
02916     }
02917   }
02918 
02919   if (!MemOpChains.empty())
02920     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
02921 
02922   if (Subtarget->isPICStyleGOT()) {
02923     // ELF / PIC requires GOT in the EBX register before function calls via PLT
02924     // GOT pointer.
02925     if (!isTailCall) {
02926       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02927                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02928     } else {
02929       // If we are tail calling and generating PIC/GOT style code load the
02930       // address of the callee into ECX. The value in ecx is used as target of
02931       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02932       // for tail calls on PIC/GOT architectures. Normally we would just put the
02933       // address of GOT into ebx and then call target@PLT. But for tail calls
02934       // ebx would be restored (since ebx is callee saved) before jumping to the
02935       // target@PLT.
02936 
02937       // Note: The actual moving to ECX is done further down.
02938       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02939       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02940           !G->getGlobal()->hasProtectedVisibility())
02941         Callee = LowerGlobalAddress(Callee, DAG);
02942       else if (isa<ExternalSymbolSDNode>(Callee))
02943         Callee = LowerExternalSymbol(Callee, DAG);
02944     }
02945   }
02946 
02947   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
02948     // From AMD64 ABI document:
02949     // For calls that may call functions that use varargs or stdargs
02950     // (prototype-less calls or calls to functions containing ellipsis (...) in
02951     // the declaration) %al is used as a hidden argument to specify the number
02952     // of SSE registers used. The contents of %al do not need to match exactly
02953     // the number of registers, but must be an upper bound on the number of SSE
02954     // registers used and is in the range 0 - 8 inclusive.
02955 
02956     // Count the number of XMM registers allocated.
02957     static const MCPhysReg XMMArgRegs[] = {
02958       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02959       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02960     };
02961     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
02962     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02963            && "SSE registers cannot be used when SSE is disabled");
02964 
02965     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02966                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
02967   }
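  // Added illustrative note: for a variadic call like printf("%f %f", a, b)
  // with two double arguments, XMM0 and XMM1 are allocated, NumXMMRegs == 2,
  // and AL is loaded with 2 -- an upper bound on the SSE registers used, as
  // the ABI excerpt above requires.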
02968 
02969   if (Is64Bit && isVarArg && IsMustTail) {
02970     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
02971     for (const auto &F : Forwards) {
02972       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02973       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
02974     }
02975   }
02976 
02977   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
02978   // don't need this because the eligibility check rejects calls that require
02979   // shuffling arguments passed in memory.
02980   if (!IsSibcall && isTailCall) {
02981     // Force all the incoming stack arguments to be loaded from the stack
02982     // before any new outgoing arguments are stored to the stack, because the
02983     // outgoing stack slots may alias the incoming argument stack slots, and
02984     // the alias isn't otherwise explicit. This is slightly more conservative
02985     // than necessary, because it means that each store effectively depends
02986     // on every argument instead of just those arguments it would clobber.
02987     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02988 
02989     SmallVector<SDValue, 8> MemOpChains2;
02990     SDValue FIN;
02991     int FI = 0;
02992     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02993       CCValAssign &VA = ArgLocs[i];
02994       if (VA.isRegLoc())
02995         continue;
02996       assert(VA.isMemLoc());
02997       SDValue Arg = OutVals[i];
02998       ISD::ArgFlagsTy Flags = Outs[i].Flags;
02999       // Skip inalloca arguments.  They don't require any work.
03000       if (Flags.isInAlloca())
03001         continue;
03002       // Create frame index.
03003       int32_t Offset = VA.getLocMemOffset()+FPDiff;
03004       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
03005       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
03006       FIN = DAG.getFrameIndex(FI, getPointerTy());
03007 
03008       if (Flags.isByVal()) {
03009         // Copy relative to framepointer.
03010         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
03011         if (!StackPtr.getNode())
03012           StackPtr = DAG.getCopyFromReg(Chain, dl,
03013                                         RegInfo->getStackRegister(),
03014                                         getPointerTy());
03015         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
03016 
03017         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
03018                                                          ArgChain,
03019                                                          Flags, DAG, dl));
03020       } else {
03021         // Store relative to framepointer.
03022         MemOpChains2.push_back(
03023           DAG.getStore(ArgChain, dl, Arg, FIN,
03024                        MachinePointerInfo::getFixedStack(FI),
03025                        false, false, 0));
03026       }
03027     }
03028 
03029     if (!MemOpChains2.empty())
03030       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
03031 
03032     // Store the return address to the appropriate stack slot.
03033     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
03034                                      getPointerTy(), RegInfo->getSlotSize(),
03035                                      FPDiff, dl);
03036   }
03037 
03038   // Build a sequence of copy-to-reg nodes chained together with token chain
03039   // and flag operands which copy the outgoing args into registers.
03040   SDValue InFlag;
03041   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
03042     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
03043                              RegsToPass[i].second, InFlag);
03044     InFlag = Chain.getValue(1);
03045   }
03046 
03047   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
03048     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
03049     // In the 64-bit large code model, we have to make all calls
03050     // through a register, since the call instruction's 32-bit
03051     // pc-relative offset may not be large enough to hold the whole
03052     // address.
03053   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
03054     // If the callee is a GlobalAddress node (quite common, every direct call
03055     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
03056     // it.
03057 
03058     // We should use an extra load for direct calls to dllimported functions in
03059     // non-JIT mode.
03060     const GlobalValue *GV = G->getGlobal();
03061     if (!GV->hasDLLImportStorageClass()) {
03062       unsigned char OpFlags = 0;
03063       bool ExtraLoad = false;
03064       unsigned WrapperKind = ISD::DELETED_NODE;
03065 
03066       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
03067       // external symbols must go through the PLT in PIC mode.  If the symbol
03068       // has hidden or protected visibility, or if it is static or local, then
03069       // we don't need to use the PLT - we can directly call it.
03070       if (Subtarget->isTargetELF() &&
03071           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
03072           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
03073         OpFlags = X86II::MO_PLT;
03074       } else if (Subtarget->isPICStyleStubAny() &&
03075                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
03076                  (!Subtarget->getTargetTriple().isMacOSX() ||
03077                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03078         // PC-relative references to external symbols should go through $stub,
03079         // unless we're building with the leopard linker or later, which
03080         // automatically synthesizes these stubs.
03081         OpFlags = X86II::MO_DARWIN_STUB;
03082       } else if (Subtarget->isPICStyleRIPRel() &&
03083                  isa<Function>(GV) &&
03084                  cast<Function>(GV)->getAttributes().
03085                    hasAttribute(AttributeSet::FunctionIndex,
03086                                 Attribute::NonLazyBind)) {
03087         // If the function is marked as non-lazy, generate an indirect call
03088         // which loads from the GOT directly. This avoids runtime overhead
03089         // at the cost of eager binding (and one extra byte of encoding).
03090         OpFlags = X86II::MO_GOTPCREL;
03091         WrapperKind = X86ISD::WrapperRIP;
03092         ExtraLoad = true;
03093       }
03094 
03095       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
03096                                           G->getOffset(), OpFlags);
03097 
03098       // Add a wrapper if needed.
03099       if (WrapperKind != ISD::DELETED_NODE)
03100         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
03101       // Add extra indirection if needed.
03102       if (ExtraLoad)
03103         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
03104                              MachinePointerInfo::getGOT(),
03105                              false, false, false, 0);
03106     }
03107   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
03108     unsigned char OpFlags = 0;
03109 
03110     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
03111     // external symbols should go through the PLT.
03112     if (Subtarget->isTargetELF() &&
03113         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
03114       OpFlags = X86II::MO_PLT;
03115     } else if (Subtarget->isPICStyleStubAny() &&
03116                (!Subtarget->getTargetTriple().isMacOSX() ||
03117                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03118       // PC-relative references to external symbols should go through $stub,
03119       // unless we're building with the leopard linker or later, which
03120       // automatically synthesizes these stubs.
03121       OpFlags = X86II::MO_DARWIN_STUB;
03122     }
03123 
03124     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
03125                                          OpFlags);
03126   } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
03127     // Zero-extend the 32-bit Callee address to 64 bits, as required by the x32 ABI.
03128     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
03129   }
03130 
03131   // Returns a chain & a flag for retval copy to use.
03132   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
03133   SmallVector<SDValue, 8> Ops;
03134 
03135   if (!IsSibcall && isTailCall) {
03136     Chain = DAG.getCALLSEQ_END(Chain,
03137                                DAG.getIntPtrConstant(NumBytesToPop, true),
03138                                DAG.getIntPtrConstant(0, true), InFlag, dl);
03139     InFlag = Chain.getValue(1);
03140   }
03141 
03142   Ops.push_back(Chain);
03143   Ops.push_back(Callee);
03144 
03145   if (isTailCall)
03146     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
03147 
03148   // Add argument registers to the end of the list so that they are known live
03149   // into the call.
03150   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
03151     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
03152                                   RegsToPass[i].second.getValueType()));
03153 
03154   // Add a register mask operand representing the call-preserved registers.
03155   const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
03156   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
03157   assert(Mask && "Missing call preserved mask for calling convention");
03158   Ops.push_back(DAG.getRegisterMask(Mask));
03159 
03160   if (InFlag.getNode())
03161     Ops.push_back(InFlag);
03162 
03163   if (isTailCall) {
03164     // We used to do:
03165     //// If this is the first return lowered for this function, add the regs
03166     //// to the liveout set for the function.
03167     // This isn't right, although it's probably harmless on x86; liveouts
03168     // should be computed from returns not tail calls.  Consider a void
03169     // function making a tail call to a function returning int.
03170     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03171   }
03172 
03173   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03174   InFlag = Chain.getValue(1);
03175 
03176   // Create the CALLSEQ_END node.
03177   unsigned NumBytesForCalleeToPop;
03178   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03179                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03180     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03181   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03182            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03183            SR == StackStructReturn)
03184     // If this is a call to a struct-return function, the callee
03185     // pops the hidden struct pointer, so we have to push it back.
03186     // This is common for Darwin/X86, Linux & Mingw32 targets.
03187     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03188     NumBytesForCalleeToPop = 4;
03189   else
03190     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03191 
03192   // Returns a flag for retval copy to use.
03193   if (!IsSibcall) {
03194     Chain = DAG.getCALLSEQ_END(Chain,
03195                                DAG.getIntPtrConstant(NumBytesToPop, true),
03196                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03197                                                      true),
03198                                InFlag, dl);
03199     InFlag = Chain.getValue(1);
03200   }
03201 
03202   // Handle result values, copying them out of physregs into vregs that we
03203   // return.
03204   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03205                          Ins, dl, DAG, InVals);
03206 }
03207 
03208 //===----------------------------------------------------------------------===//
03209 //                Fast Calling Convention (tail call) implementation
03210 //===----------------------------------------------------------------------===//
03211 
03212 //  Like StdCall, the callee cleans up the arguments, except that ECX is
03213 //  reserved for storing the address of the tail-called function. Only 2
03214 //  registers are free for argument passing (inreg). Tail call optimization
03215 //  is performed provided:
03216 //                * tailcallopt is enabled
03217 //                * caller/callee are fastcc
03218 //  On the X86_64 architecture with GOT-style position-independent code, only
03219 //  local (within-module) calls are supported at the moment.
03220 //  To keep the stack aligned according to the platform ABI, the function
03221 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
03222 //  multiple of the stack alignment. (Dynamic linkers need this - Darwin's
03223 //  dyld, for example.) If a tail-called callee has more arguments than the
03224 //  caller, the caller needs to make sure that there is room to move the
03225 //  RETADDR to. This is achieved by reserving an area the size of the argument
03226 //  delta right after the original RETADDR, but before the saved frame pointer
03227 //  or spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
03228 //  stack layout:
03229 //    arg1
03230 //    arg2
03231 //    RETADDR
03232 //    [ new RETADDR
03233 //      move area ]
03234 //    (possible EBP)
03235 //    ESI
03236 //    EDI
03237 //    local1 ..
03238 
03239 /// GetAlignedArgumentStackSize - Round the argument stack size up so the stack
03240 /// stays aligned after the return address is pushed (e.g. 16n + 12 for 16-byte alignment).
03241 unsigned
03242 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03243                                                SelectionDAG& DAG) const {
03244   MachineFunction &MF = DAG.getMachineFunction();
03245   const TargetMachine &TM = MF.getTarget();
03246   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03247       TM.getSubtargetImpl()->getRegisterInfo());
03248   const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
03249   unsigned StackAlignment = TFI.getStackAlignment();
03250   uint64_t AlignMask = StackAlignment - 1;
03251   int64_t Offset = StackSize;
03252   unsigned SlotSize = RegInfo->getSlotSize();
03253   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03254     // The residue already fits below the return-address slot; just add the difference.
03255     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03256   } else {
03257     // Mask out the lower bits and add one full stack alignment plus the residue.
03258     Offset = ((~AlignMask) & Offset) + StackAlignment +
03259       (StackAlignment-SlotSize);
03260   }
03261   return Offset;
03262 }
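// Added worked example (commentary only): with a 16-byte stack alignment and
// a 4-byte return-address slot (32-bit), AlignMask == 15 and the target
// residue is StackAlignment - SlotSize == 12.  StackSize == 20 gives
// 20 & 15 == 4 <= 12, so Offset = 20 + (12 - 4) = 28 = 16*1 + 12; StackSize
// == 30 gives 30 & 15 == 14 > 12, so Offset = (30 & ~15) + 16 + 12 = 44 =
// 16*2 + 12.  In both cases the stack is 16-byte aligned again once the
// 4-byte return address is pushed.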
03263 
03264 /// MatchingStackOffset - Return true if the given stack call argument is
03265 /// already available in the same position (relatively) of the caller's
03266 /// incoming argument stack.
03267 static
03268 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03269                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03270                          const X86InstrInfo *TII) {
03271   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03272   int FI = INT_MAX;
03273   if (Arg.getOpcode() == ISD::CopyFromReg) {
03274     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03275     if (!TargetRegisterInfo::isVirtualRegister(VR))
03276       return false;
03277     MachineInstr *Def = MRI->getVRegDef(VR);
03278     if (!Def)
03279       return false;
03280     if (!Flags.isByVal()) {
03281       if (!TII->isLoadFromStackSlot(Def, FI))
03282         return false;
03283     } else {
03284       unsigned Opcode = Def->getOpcode();
03285       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
03286           Def->getOperand(1).isFI()) {
03287         FI = Def->getOperand(1).getIndex();
03288         Bytes = Flags.getByValSize();
03289       } else
03290         return false;
03291     }
03292   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03293     if (Flags.isByVal())
03294       // ByVal argument is passed in as a pointer but it's now being
03295       // dereferenced. e.g.
03296       // define @foo(%struct.X* %A) {
03297       //   tail call @bar(%struct.X* byval %A)
03298       // }
03299       return false;
03300     SDValue Ptr = Ld->getBasePtr();
03301     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03302     if (!FINode)
03303       return false;
03304     FI = FINode->getIndex();
03305   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03306     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03307     FI = FINode->getIndex();
03308     Bytes = Flags.getByValSize();
03309   } else
03310     return false;
03311 
03312   assert(FI != INT_MAX);
03313   if (!MFI->isFixedObjectIndex(FI))
03314     return false;
03315   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03316 }
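// Added illustrative note (hypothetical IR): in
//   define i32 @f(i32 %a, i32 %b, i32 %c, i32 %d) {
//     %r = tail call i32 @g(i32 %a, i32 %b, i32 %c, i32 %d)
//     ret i32 %r
//   }
// any argument of @f that already lives on the stack is forwarded to @g at
// the same offset and size, so MatchingStackOffset returns true for it and
// the sibcall path needs no extra argument stores.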
03317 
03318 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03319 /// for tail call optimization. Targets which want to do tail call
03320 /// optimization should implement this function.
03321 bool
03322 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03323                                                      CallingConv::ID CalleeCC,
03324                                                      bool isVarArg,
03325                                                      bool isCalleeStructRet,
03326                                                      bool isCallerStructRet,
03327                                                      Type *RetTy,
03328                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03329                                     const SmallVectorImpl<SDValue> &OutVals,
03330                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03331                                                      SelectionDAG &DAG) const {
03332   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03333     return false;
03334 
03335   // If -tailcallopt is specified, make fastcc functions tail-callable.
03336   const MachineFunction &MF = DAG.getMachineFunction();
03337   const Function *CallerF = MF.getFunction();
03338 
03339   // If the function return type is x86_fp80 and the callee return type is not,
03340   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03341   // perform a tailcall optimization here.
03342   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03343     return false;
03344 
03345   CallingConv::ID CallerCC = CallerF->getCallingConv();
03346   bool CCMatch = CallerCC == CalleeCC;
03347   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03348   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03349 
03350   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
03351     if (IsTailCallConvention(CalleeCC) && CCMatch)
03352       return true;
03353     return false;
03354   }
03355 
03356   // Look for obvious safe cases to perform tail call optimization that do not
03357   // require ABI changes. This is what gcc calls sibcall.
03358 
03359   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03360   // emit a special epilogue.
03361   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03362       DAG.getSubtarget().getRegisterInfo());
03363   if (RegInfo->needsStackRealignment(MF))
03364     return false;
03365 
03366   // Also avoid sibcall optimization if either caller or callee uses struct
03367   // return semantics.
03368   if (isCalleeStructRet || isCallerStructRet)
03369     return false;
03370 
03371   // An stdcall/thiscall caller is expected to clean up its arguments; the
03372   // callee isn't going to do that.
03373   // FIXME: this is more restrictive than needed. We could produce a tailcall
03374   // when the stack adjustment matches. For example, with a thiscall that takes
03375   // only one argument.
03376   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03377                    CallerCC == CallingConv::X86_ThisCall))
03378     return false;
03379 
03380   // Do not sibcall optimize vararg calls unless all arguments are passed via
03381   // registers.
03382   if (isVarArg && !Outs.empty()) {
03383 
03384     // Optimizing for varargs on Win64 is unlikely to be safe without
03385     // additional testing.
03386     if (IsCalleeWin64 || IsCallerWin64)
03387       return false;
03388 
03389     SmallVector<CCValAssign, 16> ArgLocs;
03390     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03391                    *DAG.getContext());
03392 
03393     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03394     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03395       if (!ArgLocs[i].isRegLoc())
03396         return false;
03397   }
03398 
03399   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03400   // stack.  Therefore, if it's not used by the call it is not safe to optimize
03401   // this into a sibcall.
03402   bool Unused = false;
03403   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03404     if (!Ins[i].Used) {
03405       Unused = true;
03406       break;
03407     }
03408   }
03409   if (Unused) {
03410     SmallVector<CCValAssign, 16> RVLocs;
03411     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
03412                    *DAG.getContext());
03413     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03414     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03415       CCValAssign &VA = RVLocs[i];
03416       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
03417         return false;
03418     }
03419   }
03420 
03421   // If the calling conventions do not match, then we'd better make sure the
03422   // results are returned in the same way as what the caller expects.
03423   if (!CCMatch) {
03424     SmallVector<CCValAssign, 16> RVLocs1;
03425     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
03426                     *DAG.getContext());
03427     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03428 
03429     SmallVector<CCValAssign, 16> RVLocs2;
03430     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
03431                     *DAG.getContext());
03432     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03433 
03434     if (RVLocs1.size() != RVLocs2.size())
03435       return false;
03436     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03437       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03438         return false;
03439       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03440         return false;
03441       if (RVLocs1[i].isRegLoc()) {
03442         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03443           return false;
03444       } else {
03445         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03446           return false;
03447       }
03448     }
03449   }
03450 
03451   // If the callee takes no arguments then go on to check the results of the
03452   // call.
03453   if (!Outs.empty()) {
03454     // Check if stack adjustment is needed. For now, do not do this if any
03455     // argument is passed on the stack.
03456     SmallVector<CCValAssign, 16> ArgLocs;
03457     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03458                    *DAG.getContext());
03459 
03460     // Allocate shadow area for Win64
03461     if (IsCalleeWin64)
03462       CCInfo.AllocateStack(32, 8);
03463 
03464     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03465     if (CCInfo.getNextStackOffset()) {
03466       MachineFunction &MF = DAG.getMachineFunction();
03467       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03468         return false;
03469 
03470       // Check if the arguments are already laid out in the same way as
03471       // the caller's fixed stack objects.
03472       MachineFrameInfo *MFI = MF.getFrameInfo();
03473       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03474       const X86InstrInfo *TII =
03475           static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
03476       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03477         CCValAssign &VA = ArgLocs[i];
03478         SDValue Arg = OutVals[i];
03479         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03480         if (VA.getLocInfo() == CCValAssign::Indirect)
03481           return false;
03482         if (!VA.isRegLoc()) {
03483           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03484                                    MFI, MRI, TII))
03485             return false;
03486         }
03487       }
03488     }
03489 
03490     // If the tailcall address may be in a register, then make sure it's
03491     // possible to register allocate for it. In 32-bit, the call address can
03492     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03493     // callee-saved registers are restored. These happen to be the same
03494     // registers used to pass 'inreg' arguments so watch out for those.
03495     if (!Subtarget->is64Bit() &&
03496         ((!isa<GlobalAddressSDNode>(Callee) &&
03497           !isa<ExternalSymbolSDNode>(Callee)) ||
03498          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
03499       unsigned NumInRegs = 0;
03500       // In PIC we need an extra register to formulate the address computation
03501       // for the callee.
03502       unsigned MaxInRegs =
03503           (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03504 
03505       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03506         CCValAssign &VA = ArgLocs[i];
03507         if (!VA.isRegLoc())
03508           continue;
03509         unsigned Reg = VA.getLocReg();
03510         switch (Reg) {
03511         default: break;
03512         case X86::EAX: case X86::EDX: case X86::ECX:
03513           if (++NumInRegs == MaxInRegs)
03514             return false;
03515           break;
03516         }
03517       }
03518     }
03519   }
03520 
03521   return true;
03522 }
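// To illustrate the checks above with a hypothetical example (not taken from
// any test case): a call in tail position such as
//   int callee(int x);
//   int caller(int x) { return callee(x + 1); }
// survives all of the early exits when caller and callee share the default C
// calling convention and the argument stays in a register, whereas a variadic
// call with stack-passed operands, or an stdcall/thiscall caller invoking a
// callee with a different convention, hits one of the "return false" paths.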
03523 
03524 FastISel *
03525 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03526                                   const TargetLibraryInfo *libInfo) const {
03527   return X86::createFastISel(funcInfo, libInfo);
03528 }
03529 
03530 //===----------------------------------------------------------------------===//
03531 //                           Other Lowering Hooks
03532 //===----------------------------------------------------------------------===//
03533 
03534 static bool MayFoldLoad(SDValue Op) {
03535   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03536 }
03537 
03538 static bool MayFoldIntoStore(SDValue Op) {
03539   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03540 }
03541 
03542 static bool isTargetShuffle(unsigned Opcode) {
03543   switch(Opcode) {
03544   default: return false;
03545   case X86ISD::BLENDI:
03546   case X86ISD::PSHUFB:
03547   case X86ISD::PSHUFD:
03548   case X86ISD::PSHUFHW:
03549   case X86ISD::PSHUFLW:
03550   case X86ISD::SHUFP:
03551   case X86ISD::PALIGNR:
03552   case X86ISD::MOVLHPS:
03553   case X86ISD::MOVLHPD:
03554   case X86ISD::MOVHLPS:
03555   case X86ISD::MOVLPS:
03556   case X86ISD::MOVLPD:
03557   case X86ISD::MOVSHDUP:
03558   case X86ISD::MOVSLDUP:
03559   case X86ISD::MOVDDUP:
03560   case X86ISD::MOVSS:
03561   case X86ISD::MOVSD:
03562   case X86ISD::UNPCKL:
03563   case X86ISD::UNPCKH:
03564   case X86ISD::VPERMILPI:
03565   case X86ISD::VPERM2X128:
03566   case X86ISD::VPERMI:
03567     return true;
03568   }
03569 }
03570 
03571 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03572                                     SDValue V1, SelectionDAG &DAG) {
03573   switch(Opc) {
03574   default: llvm_unreachable("Unknown x86 shuffle node");
03575   case X86ISD::MOVSHDUP:
03576   case X86ISD::MOVSLDUP:
03577   case X86ISD::MOVDDUP:
03578     return DAG.getNode(Opc, dl, VT, V1);
03579   }
03580 }
03581 
03582 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03583                                     SDValue V1, unsigned TargetMask,
03584                                     SelectionDAG &DAG) {
03585   switch(Opc) {
03586   default: llvm_unreachable("Unknown x86 shuffle node");
03587   case X86ISD::PSHUFD:
03588   case X86ISD::PSHUFHW:
03589   case X86ISD::PSHUFLW:
03590   case X86ISD::VPERMILPI:
03591   case X86ISD::VPERMI:
03592     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03593   }
03594 }
03595 
03596 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03597                                     SDValue V1, SDValue V2, unsigned TargetMask,
03598                                     SelectionDAG &DAG) {
03599   switch(Opc) {
03600   default: llvm_unreachable("Unknown x86 shuffle node");
03601   case X86ISD::PALIGNR:
03602   case X86ISD::VALIGN:
03603   case X86ISD::SHUFP:
03604   case X86ISD::VPERM2X128:
03605     return DAG.getNode(Opc, dl, VT, V1, V2,
03606                        DAG.getConstant(TargetMask, MVT::i8));
03607   }
03608 }
03609 
03610 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03611                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03612   switch(Opc) {
03613   default: llvm_unreachable("Unknown x86 shuffle node");
03614   case X86ISD::MOVLHPS:
03615   case X86ISD::MOVLHPD:
03616   case X86ISD::MOVHLPS:
03617   case X86ISD::MOVLPS:
03618   case X86ISD::MOVLPD:
03619   case X86ISD::MOVSS:
03620   case X86ISD::MOVSD:
03621   case X86ISD::UNPCKL:
03622   case X86ISD::UNPCKH:
03623     return DAG.getNode(Opc, dl, VT, V1, V2);
03624   }
03625 }
03626 
03627 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03628   MachineFunction &MF = DAG.getMachineFunction();
03629   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03630       DAG.getSubtarget().getRegisterInfo());
03631   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03632   int ReturnAddrIndex = FuncInfo->getRAIndex();
03633 
03634   if (ReturnAddrIndex == 0) {
03635     // Set up a frame object for the return address.
03636     unsigned SlotSize = RegInfo->getSlotSize();
03637     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03638                                                            -(int64_t)SlotSize,
03639                                                            false);
03640     FuncInfo->setRAIndex(ReturnAddrIndex);
03641   }
03642 
03643   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03644 }
03645 
03646 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03647                                        bool hasSymbolicDisplacement) {
03648   // Offset should fit into 32 bit immediate field.
03649   if (!isInt<32>(Offset))
03650     return false;
03651 
03652   // If we don't have a symbolic displacement - we don't have any extra
03653   // restrictions.
03654   if (!hasSymbolicDisplacement)
03655     return true;
03656 
03657   // FIXME: Some tweaks might be needed for medium code model.
03658   if (M != CodeModel::Small && M != CodeModel::Kernel)
03659     return false;
03660 
03661   // For the small code model we assume that the last object ends at least 16MB
03662   // before the end of the 31-bit address range. We may also accept pretty large
03663   // negative constants, knowing that all objects are in the positive half.
03664   if (M == CodeModel::Small && Offset < 16*1024*1024)
03665     return true;
03666 
03667   // For the kernel code model we know that all objects reside in the negative
03668   // half of the 32-bit address space. We may not accept negative offsets, since
03669   // they may be just off, but we may accept pretty large positive ones.
03670   if (M == CodeModel::Kernel && Offset > 0)
03671     return true;
03672 
03673   return false;
03674 }
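// A few illustrative inputs for the rules above (hypothetical offsets), all
// with hasSymbolicDisplacement == true:
//   isOffsetSuitableForCodeModel(0x100000,   CodeModel::Small,  true) -> true
//   isOffsetSuitableForCodeModel(0x2000000,  CodeModel::Small,  true) -> false (>= 16MB)
//   isOffsetSuitableForCodeModel(-8,         CodeModel::Kernel, true) -> false (negative)
//   isOffsetSuitableForCodeModel(1LL << 33,  CodeModel::Small,  true) -> false (not int32)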
03675 
03676 /// isCalleePop - Determines whether the callee is required to pop its
03677 /// own arguments. Callee pop is necessary to support tail calls.
03678 bool X86::isCalleePop(CallingConv::ID CallingConv,
03679                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03680   switch (CallingConv) {
03681   default:
03682     return false;
03683   case CallingConv::X86_StdCall:
03684   case CallingConv::X86_FastCall:
03685   case CallingConv::X86_ThisCall:
03686     return !is64Bit;
03687   case CallingConv::Fast:
03688   case CallingConv::GHC:
03689   case CallingConv::HiPE:
03690     if (IsVarArg)
03691       return false;
03692     return TailCallOpt;
03693   }
03694 }
03695 
03696 /// \brief Return true if the condition is an unsigned comparison operation.
03697 static bool isX86CCUnsigned(unsigned X86CC) {
03698   switch (X86CC) {
03699   default: llvm_unreachable("Invalid integer condition!");
03700   case X86::COND_E:     return true;
03701   case X86::COND_G:     return false;
03702   case X86::COND_GE:    return false;
03703   case X86::COND_L:     return false;
03704   case X86::COND_LE:    return false;
03705   case X86::COND_NE:    return true;
03706   case X86::COND_B:     return true;
03707   case X86::COND_A:     return true;
03708   case X86::COND_BE:    return true;
03709   case X86::COND_AE:    return true;
03710   }
03711   llvm_unreachable("covered switch fell through?!");
03712 }
03713 
03714 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
03715 /// X86-specific condition code, returning the condition code and the LHS/RHS of
03716 /// the comparison to make.
03717 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03718                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03719   if (!isFP) {
03720     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03721       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03722         // X > -1   -> X == 0, jump !sign.
03723         RHS = DAG.getConstant(0, RHS.getValueType());
03724         return X86::COND_NS;
03725       }
03726       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03727         // X < 0   -> X == 0, jump on sign.
03728         return X86::COND_S;
03729       }
03730       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03731         // X < 1   -> X <= 0
03732         RHS = DAG.getConstant(0, RHS.getValueType());
03733         return X86::COND_LE;
03734       }
03735     }
03736 
03737     switch (SetCCOpcode) {
03738     default: llvm_unreachable("Invalid integer condition!");
03739     case ISD::SETEQ:  return X86::COND_E;
03740     case ISD::SETGT:  return X86::COND_G;
03741     case ISD::SETGE:  return X86::COND_GE;
03742     case ISD::SETLT:  return X86::COND_L;
03743     case ISD::SETLE:  return X86::COND_LE;
03744     case ISD::SETNE:  return X86::COND_NE;
03745     case ISD::SETULT: return X86::COND_B;
03746     case ISD::SETUGT: return X86::COND_A;
03747     case ISD::SETULE: return X86::COND_BE;
03748     case ISD::SETUGE: return X86::COND_AE;
03749     }
03750   }
03751 
03752   // First determine if it is required or is profitable to flip the operands.
03753 
03754   // If LHS is a foldable load, but RHS is not, flip the condition.
03755   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03756       !ISD::isNON_EXTLoad(RHS.getNode())) {
03757     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03758     std::swap(LHS, RHS);
03759   }
03760 
03761   switch (SetCCOpcode) {
03762   default: break;
03763   case ISD::SETOLT:
03764   case ISD::SETOLE:
03765   case ISD::SETUGT:
03766   case ISD::SETUGE:
03767     std::swap(LHS, RHS);
03768     break;
03769   }
03770 
03771   // On a floating point condition, the flags are set as follows:
03772   // ZF  PF  CF   op
03773   //  0 | 0 | 0 | X > Y
03774   //  0 | 0 | 1 | X < Y
03775   //  1 | 0 | 0 | X == Y
03776   //  1 | 1 | 1 | unordered
03777   switch (SetCCOpcode) {
03778   default: llvm_unreachable("Condcode should be pre-legalized away");
03779   case ISD::SETUEQ:
03780   case ISD::SETEQ:   return X86::COND_E;
03781   case ISD::SETOLT:              // flipped
03782   case ISD::SETOGT:
03783   case ISD::SETGT:   return X86::COND_A;
03784   case ISD::SETOLE:              // flipped
03785   case ISD::SETOGE:
03786   case ISD::SETGE:   return X86::COND_AE;
03787   case ISD::SETUGT:              // flipped
03788   case ISD::SETULT:
03789   case ISD::SETLT:   return X86::COND_B;
03790   case ISD::SETUGE:              // flipped
03791   case ISD::SETULE:
03792   case ISD::SETLE:   return X86::COND_BE;
03793   case ISD::SETONE:
03794   case ISD::SETNE:   return X86::COND_NE;
03795   case ISD::SETUO:   return X86::COND_P;
03796   case ISD::SETO:    return X86::COND_NP;
03797   case ISD::SETOEQ:
03798   case ISD::SETUNE:  return X86::COND_INVALID;
03799   }
03800 }
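// Two worked examples of the translation above (illustrative only):
//  - Integer: (X setlt 1) is rewritten to (X setle 0) and yields X86::COND_LE.
//  - FP:      (X setolt Y) has its operands swapped to (Y, X) and then maps to
//             X86::COND_A, since X < Y is equivalent to Y > X ("above" per the
//             flag table: ZF == 0 and CF == 0).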
03801 
03802 /// hasFPCMov - Is there a floating point cmov for the specific X86 condition
03803 /// code? The current x86 ISA includes the following FP cmov instructions:
03804 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03805 static bool hasFPCMov(unsigned X86CC) {
03806   switch (X86CC) {
03807   default:
03808     return false;
03809   case X86::COND_B:
03810   case X86::COND_BE:
03811   case X86::COND_E:
03812   case X86::COND_P:
03813   case X86::COND_A:
03814   case X86::COND_AE:
03815   case X86::COND_NE:
03816   case X86::COND_NP:
03817     return true;
03818   }
03819 }
03820 
03821 /// isFPImmLegal - Returns true if the target can instruction select the
03822 /// specified FP immediate natively. If false, the legalizer will
03823 /// materialize the FP immediate as a load from a constant pool.
03824 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03825   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03826     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03827       return true;
03828   }
03829   return false;
03830 }
03831 
03832 /// \brief Returns true if it is beneficial to convert a load of a constant
03833 /// to just the constant itself.
03834 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03835                                                           Type *Ty) const {
03836   assert(Ty->isIntegerTy());
03837 
03838   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03839   if (BitSize == 0 || BitSize > 64)
03840     return false;
03841   return true;
03842 }
03843 
03844 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03845 /// the specified half-open range [Low, Hi).
03846 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03847   return (Val < 0) || (Val >= Low && Val < Hi);
03848 }
03849 
03850 /// isUndefOrEqual - Return true if Val is either less than zero (undef) or
03851 /// equal to the specified value.
03852 static bool isUndefOrEqual(int Val, int CmpVal) {
03853   return (Val < 0 || Val == CmpVal);
03854 }
03855 
03856 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03857 /// at position Pos and ending at Pos+Size, is undef or follows the sequential
03858 /// range starting at Low, i.e. Mask[Pos+i] is undef or equals Low+i.
03859 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03860                                        unsigned Pos, unsigned Size, int Low) {
03861   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03862     if (!isUndefOrEqual(Mask[i], Low))
03863       return false;
03864   return true;
03865 }
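// For example (illustrative mask values): with Mask = <4, -1, 6, 7>,
// isSequentialOrUndefInRange(Mask, 0, 4, 4) returns true, since element 1 is
// undef and the remaining elements follow the sequence 4, 5, 6, 7.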
03866 
03867 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
03868 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
03869 /// the second operand.
03870 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
03871   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
03872     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
03873   if (VT == MVT::v2f64 || VT == MVT::v2i64)
03874     return (Mask[0] < 2 && Mask[1] < 2);
03875   return false;
03876 }
03877 
03878 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
03879 /// is suitable for input to PSHUFHW.
03880 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03881   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03882     return false;
03883 
03884   // Lower quadword copied in order or undef.
03885   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
03886     return false;
03887 
03888   // Upper quadword shuffled.
03889   for (unsigned i = 4; i != 8; ++i)
03890     if (!isUndefOrInRange(Mask[i], 4, 8))
03891       return false;
03892 
03893   if (VT == MVT::v16i16) {
03894     // Lower quadword copied in order or undef.
03895     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
03896       return false;
03897 
03898     // Upper quadword shuffled.
03899     for (unsigned i = 12; i != 16; ++i)
03900       if (!isUndefOrInRange(Mask[i], 12, 16))
03901         return false;
03902   }
03903 
03904   return true;
03905 }
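// For instance (hypothetical v8i16 masks): <0, 1, 2, 3, 7, 6, 5, 4> passes this
// check -- the lower quadword is copied in order and the upper quadword is
// shuffled within itself -- while <0, 1, 2, 3, 0, 1, 2, 3> fails because its
// upper elements reference the lower quadword.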
03906 
03907 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
03908 /// is suitable for input to PSHUFLW.
03909 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03910   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03911     return false;
03912 
03913   // Upper quadword copied in order.
03914   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
03915     return false;
03916 
03917   // Lower quadword shuffled.
03918   for (unsigned i = 0; i != 4; ++i)
03919     if (!isUndefOrInRange(Mask[i], 0, 4))
03920       return false;
03921 
03922   if (VT == MVT::v16i16) {
03923     // Upper quadword copied in order.
03924     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
03925       return false;
03926 
03927     // Lower quadword shuffled.
03928     for (unsigned i = 8; i != 12; ++i)
03929       if (!isUndefOrInRange(Mask[i], 8, 12))
03930         return false;
03931   }
03932 
03933   return true;
03934 }
03935 
03936 /// \brief Return true if the mask specifies a shuffle of elements that is
03937 /// suitable for input to intralane (palignr) or interlane (valign) vector
03938 /// right-shift.
03939 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
03940   unsigned NumElts = VT.getVectorNumElements();
03941   unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
03942   unsigned NumLaneElts = NumElts/NumLanes;
03943 
03944   // Do not handle 64-bit element shuffles with palignr.
03945   if (NumLaneElts == 2)
03946     return false;
03947 
03948   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
03949     unsigned i;
03950     for (i = 0; i != NumLaneElts; ++i) {
03951       if (Mask[i+l] >= 0)
03952         break;
03953     }
03954 
03955     // Lane is all undef, go to next lane
03956     if (i == NumLaneElts)
03957       continue;
03958 
03959     int Start = Mask[i+l];
03960 
03961     // Make sure it's in this lane in one of the sources.
03962     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
03963         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
03964       return false;
03965 
03966     // If not lane 0, then we must match lane 0
03967     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
03968       return false;
03969 
03970     // Correct second source to be contiguous with first source
03971     if (Start >= (int)NumElts)
03972       Start -= NumElts - NumLaneElts;
03973 
03974     // Make sure we're shifting in the right direction.
03975     if (Start <= (int)(i+l))
03976       return false;
03977 
03978     Start -= i;
03979 
03980     // Check the rest of the elements to see if they are consecutive.
03981     for (++i; i != NumLaneElts; ++i) {
03982       int Idx = Mask[i+l];
03983 
03984       // Make sure it's in this lane.
03985       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
03986           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
03987         return false;
03988 
03989       // If not lane 0, then we must match lane 0
03990       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
03991         return false;
03992 
03993       if (Idx >= (int)NumElts)
03994         Idx -= NumElts - NumLaneElts;
03995 
03996       if (!isUndefOrEqual(Idx, Start+i))
03997         return false;
03998 
03999     }
04000   }
04001 
04002   return true;
04003 }
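// A concrete (illustrative) case for the intralane form: the v8i16 mask
// <1, 2, 3, 4, 5, 6, 7, 8> is accepted, since it reads eight consecutive
// elements starting one position into the concatenated sources, i.e. a rotate
// by one element; <1, 2, 3, 4, 5, 6, 7, 0> is rejected because the last
// element breaks the consecutive run.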
04004 
04005 /// \brief Return true if the node specifies a shuffle of elements that is
04006 /// suitable for input to PALIGNR.
04007 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
04008                           const X86Subtarget *Subtarget) {
04009   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
04010       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
04011       VT.is512BitVector())
04012     // FIXME: Add AVX512BW.
04013     return false;
04014 
04015   return isAlignrMask(Mask, VT, false);
04016 }
04017 
04018 /// \brief Return true if the node specifies a shuffle of elements that is
04019 /// suitable for input to VALIGN.
04020 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
04021                           const X86Subtarget *Subtarget) {
04022   // FIXME: Add AVX512VL.
04023   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
04024     return false;
04025   return isAlignrMask(Mask, VT, true);
04026 }
04027 
04028 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
04029 /// the two vector operands have swapped position.
04030 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
04031                                      unsigned NumElems) {
04032   for (unsigned i = 0; i != NumElems; ++i) {
04033     int idx = Mask[i];
04034     if (idx < 0)
04035       continue;
04036     else if (idx < (int)NumElems)
04037       Mask[i] = idx + NumElems;
04038     else
04039       Mask[i] = idx - NumElems;
04040   }
04041 }
04042 
04043 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
04044 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
04045 /// SHUFPS and SHUFPD. If Commuted is true, it checks whether the sources are in
04046 /// the reverse order of what x86 shuffles want.
04047 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
04048 
04049   unsigned NumElems = VT.getVectorNumElements();
04050   unsigned NumLanes = VT.getSizeInBits()/128;
04051   unsigned NumLaneElems = NumElems/NumLanes;
04052 
04053   if (NumLaneElems != 2 && NumLaneElems != 4)
04054     return false;
04055 
04056   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04057   bool symetricMaskRequired =
04058     (VT.getSizeInBits() >= 256) && (EltSize == 32);
04059 
04060   // VSHUFPSY divides the resulting vector into 4 chunks.
04061   // The sources are also split into 4 chunks, and each destination
04062   // chunk must come from a different source chunk.
04063   //
04064   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
04065   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
04066   //
04067   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
04068   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
04069   //
04070   // VSHUFPDY divides the resulting vector into 4 chunks.
04071   // The sources are also split into 4 chunks, and each destination
04072   // chunk must come from a different source chunk.
04073   //
04074   //  SRC1 =>      X3       X2       X1       X0
04075   //  SRC2 =>      Y3       Y2       Y1       Y0
04076   //
04077   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
04078   //
04079   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
04080   unsigned HalfLaneElems = NumLaneElems/2;
04081   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
04082     for (unsigned i = 0; i != NumLaneElems; ++i) {
04083       int Idx = Mask[i+l];
04084       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
04085       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
04086         return false;
04087       // For VSHUFPSY, the mask of the second half must be the same as the
04088       // first but with the appropriate offsets. This works in the same way as
04089       // VPERMILPS works with masks.
04090       if (!symetricMaskRequired || Idx < 0)
04091         continue;
04092       if (MaskVal[i] < 0) {
04093         MaskVal[i] = Idx - l;
04094         continue;
04095       }
04096       if ((signed)(Idx - l) != MaskVal[i])
04097         return false;
04098     }
04099   }
04100 
04101   return true;
04102 }
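// A small illustration (made-up masks) for the 128-bit case: on v4f32 the mask
// <0, 3, 4, 6> is accepted -- the first two elements come from V1 and the last
// two from V2, as SHUFPS expects -- while <0, 4, 1, 5> is rejected because its
// second element already reaches into V2.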
04103 
04104 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
04105 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
04106 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
04107   if (!VT.is128BitVector())
04108     return false;
04109 
04110   unsigned NumElems = VT.getVectorNumElements();
04111 
04112   if (NumElems != 4)
04113     return false;
04114 
04115   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
04116   return isUndefOrEqual(Mask[0], 6) &&
04117          isUndefOrEqual(Mask[1], 7) &&
04118          isUndefOrEqual(Mask[2], 2) &&
04119          isUndefOrEqual(Mask[3], 3);
04120 }
04121 
04122 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
04123 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
04124 /// <2, 3, 2, 3>
04125 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
04126   if (!VT.is128BitVector())
04127     return false;
04128 
04129   unsigned NumElems = VT.getVectorNumElements();
04130 
04131   if (NumElems != 4)
04132     return false;
04133 
04134   return isUndefOrEqual(Mask[0], 2) &&
04135          isUndefOrEqual(Mask[1], 3) &&
04136          isUndefOrEqual(Mask[2], 2) &&
04137          isUndefOrEqual(Mask[3], 3);
04138 }
04139 
04140 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
04141 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
04142 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
04143   if (!VT.is128BitVector())
04144     return false;
04145 
04146   unsigned NumElems = VT.getVectorNumElements();
04147 
04148   if (NumElems != 2 && NumElems != 4)
04149     return false;
04150 
04151   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04152     if (!isUndefOrEqual(Mask[i], i + NumElems))
04153       return false;
04154 
04155   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
04156     if (!isUndefOrEqual(Mask[i], i))
04157       return false;
04158 
04159   return true;
04160 }
04161 
04162 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
04163 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
04164 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
04165   if (!VT.is128BitVector())
04166     return false;
04167 
04168   unsigned NumElems = VT.getVectorNumElements();
04169 
04170   if (NumElems != 2 && NumElems != 4)
04171     return false;
04172 
04173   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04174     if (!isUndefOrEqual(Mask[i], i))
04175       return false;
04176 
04177   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04178     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
04179       return false;
04180 
04181   return true;
04182 }
04183 
04184 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
04185 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
04186 /// i.e., if all but one element come from the same vector.
04187 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
04188   // TODO: Deal with AVX's VINSERTPS
04189   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
04190     return false;
04191 
04192   unsigned CorrectPosV1 = 0;
04193   unsigned CorrectPosV2 = 0;
04194   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
04195     if (Mask[i] == -1) {
04196       ++CorrectPosV1;
04197       ++CorrectPosV2;
04198       continue;
04199     }
04200 
04201     if (Mask[i] == i)
04202       ++CorrectPosV1;
04203     else if (Mask[i] == i + 4)
04204       ++CorrectPosV2;
04205   }
04206 
04207   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
04208     // We have 3 elements (undefs count as elements from any vector) from one
04209     // vector, and one from another.
04210     return true;
04211 
04212   return false;
04213 }
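// For illustration (hypothetical v4f32 masks): <0, 1, 2, 7> is a valid INSERTPS
// pattern here, since three elements stay in place from V1 and only element 3
// is taken from V2; <0, 5, 2, 7>, with two elements from V2, is not.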
04214 
04215 //
04216 // Some special combinations that can be optimized.
04217 //
04218 static
04219 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
04220                                SelectionDAG &DAG) {
04221   MVT VT = SVOp->getSimpleValueType(0);
04222   SDLoc dl(SVOp);
04223 
04224   if (VT != MVT::v8i32 && VT != MVT::v8f32)
04225     return SDValue();
04226 
04227   ArrayRef<int> Mask = SVOp->getMask();
04228 
04229   // These are the special masks that may be optimized.
04230   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
04231   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
04232   bool MatchEvenMask = true;
04233   bool MatchOddMask  = true;
04234   for (int i=0; i<8; ++i) {
04235     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
04236       MatchEvenMask = false;
04237     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
04238       MatchOddMask = false;
04239   }
04240 
04241   if (!MatchEvenMask && !MatchOddMask)
04242     return SDValue();
04243 
04244   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
04245 
04246   SDValue Op0 = SVOp->getOperand(0);
04247   SDValue Op1 = SVOp->getOperand(1);
04248 
04249   if (MatchEvenMask) {
04250     // Shift the second operand right to 32 bits.
04251     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
04252     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
04253   } else {
04254     // Shift the first operand left to 32 bits.
04255     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
04256     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
04257   }
04258   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
04259   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
04260 }
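// Sketch of the even-mask case above (assuming v8i32 inputs A and B): the
// original shuffle <0, 8, 2, 10, 4, 12, 6, 14> interleaves the even elements of
// A and B. B is first reshuffled with <-1, 0, -1, 2, -1, 4, -1, 6> so its even
// elements land in the odd result slots, and the final mask
// <0, 9, 2, 11, 4, 13, 6, 15> then combines A and the reshuffled B as a simple
// per-element blend.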
04261 
04262 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
04263 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
04264 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
04265                          bool HasInt256, bool V2IsSplat = false) {
04266 
04267   assert(VT.getSizeInBits() >= 128 &&
04268          "Unsupported vector type for unpckl");
04269 
04270   unsigned NumElts = VT.getVectorNumElements();
04271   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04272       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04273     return false;
04274 
04275   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
04276          "Unsupported vector type for unpckh");
04277 
04278   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04279   unsigned NumLanes = VT.getSizeInBits()/128;
04280   unsigned NumLaneElts = NumElts/NumLanes;
04281 
04282   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04283     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04284       int BitI  = Mask[l+i];
04285       int BitI1 = Mask[l+i+1];
04286       if (!isUndefOrEqual(BitI, j))
04287         return false;
04288       if (V2IsSplat) {
04289         if (!isUndefOrEqual(BitI1, NumElts))
04290           return false;
04291       } else {
04292         if (!isUndefOrEqual(BitI1, j + NumElts))
04293           return false;
04294       }
04295     }
04296   }
04297 
04298   return true;
04299 }
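// For reference (illustrative masks): on v4i32 the classic unpcklps pattern is
// <0, 4, 1, 5>, interleaving the low halves of the two sources. With
// V2IsSplat == true, a mask such as <0, 4, 1, 4> is also accepted, since each
// odd position may refer to the (splatted) first element of V2.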
04300 
04301 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
04302 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
04303 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
04304                          bool HasInt256, bool V2IsSplat = false) {
04305   assert(VT.getSizeInBits() >= 128 &&
04306          "Unsupported vector type for unpckh");
04307 
04308   unsigned NumElts = VT.getVectorNumElements();
04309   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04310       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04311     return false;
04312 
04313   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
04314          "Unsupported vector type for unpckh");
04315 
04316   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04317   unsigned NumLanes = VT.getSizeInBits()/128;
04318   unsigned NumLaneElts = NumElts/NumLanes;
04319 
04320   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04321     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04322       int BitI  = Mask[l+i];
04323       int BitI1 = Mask[l+i+1];
04324       if (!isUndefOrEqual(BitI, j))
04325         return false;
04326       if (V2IsSplat) {
04327         if (isUndefOrEqual(BitI1, NumElts))
04328           return false;
04329       } else {
04330         if (!isUndefOrEqual(BitI1, j+NumElts))
04331           return false;
04332       }
04333     }
04334   }
04335   return true;
04336 }
04337 
04338 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
04339 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
04340 /// <0, 0, 1, 1>
04341 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04342   unsigned NumElts = VT.getVectorNumElements();
04343   bool Is256BitVec = VT.is256BitVector();
04344 
04345   if (VT.is512BitVector())
04346     return false;
04347   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04348          "Unsupported vector type for unpckh");
04349 
04350   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
04351       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04352     return false;
04353 
04354   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
04355   // FIXME: Need a better way to get rid of this, there's no latency difference
04356   // between UNPCKLPD and MOVDDUP; the latter should always be checked first and
04357   // the former later. We should also remove the "_undef" special mask.
04358   if (NumElts == 4 && Is256BitVec)
04359     return false;
04360 
04361   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04362   // independently on 128-bit lanes.
04363   unsigned NumLanes = VT.getSizeInBits()/128;
04364   unsigned NumLaneElts = NumElts/NumLanes;
04365 
04366   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04367     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04368       int BitI  = Mask[l+i];
04369       int BitI1 = Mask[l+i+1];
04370 
04371       if (!isUndefOrEqual(BitI, j))
04372         return false;
04373       if (!isUndefOrEqual(BitI1, j))
04374         return false;
04375     }
04376   }
04377 
04378   return true;
04379 }
04380 
04381 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
04382 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
04383 /// <2, 2, 3, 3>
04384 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04385   unsigned NumElts = VT.getVectorNumElements();
04386 
04387   if (VT.is512BitVector())
04388     return false;
04389 
04390   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04391          "Unsupported vector type for unpckh");
04392 
04393   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04394       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04395     return false;
04396 
04397   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04398   // independently on 128-bit lanes.
04399   unsigned NumLanes = VT.getSizeInBits()/128;
04400   unsigned NumLaneElts = NumElts/NumLanes;
04401 
04402   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04403     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04404       int BitI  = Mask[l+i];
04405       int BitI1 = Mask[l+i+1];
04406       if (!isUndefOrEqual(BitI, j))
04407         return false;
04408       if (!isUndefOrEqual(BitI1, j))
04409         return false;
04410     }
04411   }
04412   return true;
04413 }
04414 
04415 // Match for INSERTI64x4/INSERTF64x4 instructions producing (src0[0], src1[0])
04416 // or (src1[0], src0[1]), i.e. manipulation of 256-bit sub-vectors.
04417 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
04418   if (!VT.is512BitVector())
04419     return false;
04420 
04421   unsigned NumElts = VT.getVectorNumElements();
04422   unsigned HalfSize = NumElts/2;
04423   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
04424     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
04425       *Imm = 1;
04426       return true;
04427     }
04428   }
04429   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
04430     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
04431       *Imm = 0;
04432       return true;
04433     }
04434   }
04435   return false;
04436 }
04437 
04438 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
04439 /// specifies a shuffle of elements that is suitable for input to MOVSS,
04440 /// MOVSD, and MOVD, i.e. setting the lowest element.
04441 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
04442   if (VT.getVectorElementType().getSizeInBits() < 32)
04443     return false;
04444   if (!VT.is128BitVector())
04445     return false;
04446 
04447   unsigned NumElts = VT.getVectorNumElements();
04448 
04449   if (!isUndefOrEqual(Mask[0], NumElts))
04450     return false;
04451 
04452   for (unsigned i = 1; i != NumElts; ++i)
04453     if (!isUndefOrEqual(Mask[i], i))
04454       return false;
04455 
04456   return true;
04457 }
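// A mask accepted here, for illustration: <4, 1, 2, 3> on v4f32 takes the
// lowest element from V2 and keeps the remaining elements of V1 in place --
// exactly the MOVSS pattern -- whereas <4, 5, 2, 3> is rejected because
// element 1 does not come from V1 in order.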
04458 
04459 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
04460 /// as permutations between 128-bit chunks or halves. As an example, in the
04461 /// shuffle below:
04462 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
04463 /// the first half comes from the second half of V1 and the second half from
04464 /// the second half of V2.
04465 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04466   if (!HasFp256 || !VT.is256BitVector())
04467     return false;
04468 
04469   // The shuffle result is divided into half A and half B. In total the two
04470   // sources have 4 halves, namely: C, D, E, F. The final values of A and
04471   // B must come from C, D, E or F.
04472   unsigned HalfSize = VT.getVectorNumElements()/2;
04473   bool MatchA = false, MatchB = false;
04474 
04475   // Check if A comes from one of C, D, E, F.
04476   for (unsigned Half = 0; Half != 4; ++Half) {
04477     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
04478       MatchA = true;
04479       break;
04480     }
04481   }
04482 
04483   // Check if B comes from one of C, D, E, F.
04484   for (unsigned Half = 0; Half != 4; ++Half) {
04485     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
04486       MatchB = true;
04487       break;
04488     }
04489   }
04490 
04491   return MatchA && MatchB;
04492 }
04493 
04494 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
04495 /// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
04496 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
04497   MVT VT = SVOp->getSimpleValueType(0);
04498 
04499   unsigned HalfSize = VT.getVectorNumElements()/2;
04500 
04501   unsigned FstHalf = 0, SndHalf = 0;
04502   for (unsigned i = 0; i < HalfSize; ++i) {
04503     if (SVOp->getMaskElt(i) > 0) {
04504       FstHalf = SVOp->getMaskElt(i)/HalfSize;
04505       break;
04506     }
04507   }
04508   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
04509     if (SVOp->getMaskElt(i) > 0) {
04510       SndHalf = SVOp->getMaskElt(i)/HalfSize;
04511       break;
04512     }
04513   }
04514 
04515   return (FstHalf | (SndHalf << 4));
04516 }
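// Worked example (using the illustrative mask from the comment above): for the
// v8f32 shuffle <4, 5, 6, 7, 12, 13, 14, 15>, the first result half starts at
// element 4, so FstHalf = 4/4 = 1, and the second half starts at element 12,
// so SndHalf = 12/4 = 3; the encoded immediate is 1 | (3 << 4) = 0x31.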
04517 
04518 // Symmetric in-lane mask. Each lane has 4 elements (for imm8).
04519 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
04520   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04521   if (EltSize < 32)
04522     return false;
04523 
04524   unsigned NumElts = VT.getVectorNumElements();
04525   Imm8 = 0;
04526   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
04527     for (unsigned i = 0; i != NumElts; ++i) {
04528       if (Mask[i] < 0)
04529         continue;
04530       Imm8 |= Mask[i] << (i*2);
04531     }
04532     return true;
04533   }
04534 
04535   unsigned LaneSize = 4;
04536   SmallVector<int, 4> MaskVal(LaneSize, -1);
04537 
04538   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04539     for (unsigned i = 0; i != LaneSize; ++i) {
04540       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04541         return false;
04542       if (Mask[i+l] < 0)
04543         continue;
04544       if (MaskVal[i] < 0) {
04545         MaskVal[i] = Mask[i+l] - l;
04546         Imm8 |= MaskVal[i] << (i*2);
04547         continue;
04548       }
04549       if (Mask[i+l] != (signed)(MaskVal[i]+l))
04550         return false;
04551     }
04552   }
04553   return true;
04554 }
04555 
04556 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
04557 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
04558 /// Note that VPERMIL mask matching differs depending on whether the underlying
04559 /// element type is 32 or 64 bits. In VPERMILPS the high half of the mask must
04560 /// select the same relative elements as the low half, but from the higher half
04561 /// of the source. In VPERMILPD the two lanes may be shuffled independently, with
04562 /// the same restriction that lanes can't be crossed. Also handles PSHUFDY.
04563 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
04564   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04565   if (VT.getSizeInBits() < 256 || EltSize < 32)
04566     return false;
04567   bool symetricMaskRequired = (EltSize == 32);
04568   unsigned NumElts = VT.getVectorNumElements();
04569 
04570   unsigned NumLanes = VT.getSizeInBits()/128;
04571   unsigned LaneSize = NumElts/NumLanes;
04572   // 2 or 4 elements in one lane
04573 
04574   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
04575   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04576     for (unsigned i = 0; i != LaneSize; ++i) {
04577       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04578         return false;
04579       if (symetricMaskRequired) {
04580         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
04581           ExpectedMaskVal[i] = Mask[i+l] - l;
04582           continue;
04583         }
04584         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
04585           return false;
04586       }
04587     }
04588   }
04589   return true;
04590 }
04591 
04592 /// isCommutedMOVLMask - Returns true if the shuffle mask is exactly the reverse
04593 /// of what x86 movss wants: movss requires the lowest element to be the lowest
04594 /// element of vector 2 and the other elements to come from vector 1 in order.
04595 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
04596                                bool V2IsSplat = false, bool V2IsUndef = false) {
04597   if (!VT.is128BitVector())
04598     return false;
04599 
04600   unsigned NumOps = VT.getVectorNumElements();
04601   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
04602     return false;
04603 
04604   if (!isUndefOrEqual(Mask[0], 0))
04605     return false;
04606 
04607   for (unsigned i = 1; i != NumOps; ++i)
04608     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
04609           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
04610           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
04611       return false;
04612 
04613   return true;
04614 }
04615 
04616 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04617 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
04618 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
04619 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
04620                            const X86Subtarget *Subtarget) {
04621   if (!Subtarget->hasSSE3())
04622     return false;
04623 
04624   unsigned NumElems = VT.getVectorNumElements();
04625 
04626   if ((VT.is128BitVector() && NumElems != 4) ||
04627       (VT.is256BitVector() && NumElems != 8) ||
04628       (VT.is512BitVector() && NumElems != 16))
04629     return false;
04630 
04631   // "i+1" is the value the indexed mask element must have
04632   for (unsigned i = 0; i != NumElems; i += 2)
04633     if (!isUndefOrEqual(Mask[i], i+1) ||
04634         !isUndefOrEqual(Mask[i+1], i+1))
04635       return false;
04636 
04637   return true;
04638 }
04639 
04640 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04641 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
04642 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
04643 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
04644                            const X86Subtarget *Subtarget) {
04645   if (!Subtarget->hasSSE3())
04646     return false;
04647 
04648   unsigned NumElems = VT.getVectorNumElements();
04649 
04650   if ((VT.is128BitVector() && NumElems != 4) ||
04651       (VT.is256BitVector() && NumElems != 8) ||
04652       (VT.is512BitVector() && NumElems != 16))
04653     return false;
04654 
04655   // "i" is the value the indexed mask element must have
04656   for (unsigned i = 0; i != NumElems; i += 2)
04657     if (!isUndefOrEqual(Mask[i], i) ||
04658         !isUndefOrEqual(Mask[i+1], i))
04659       return false;
04660 
04661   return true;
04662 }
04663 
04664 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
04665 /// specifies a shuffle of elements that is suitable for input to 256-bit
04666 /// version of MOVDDUP.
04667 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04668   if (!HasFp256 || !VT.is256BitVector())
04669     return false;
04670 
04671   unsigned NumElts = VT.getVectorNumElements();
04672   if (NumElts != 4)
04673     return false;
04674 
04675   for (unsigned i = 0; i != NumElts/2; ++i)
04676     if (!isUndefOrEqual(Mask[i], 0))
04677       return false;
04678   for (unsigned i = NumElts/2; i != NumElts; ++i)
04679     if (!isUndefOrEqual(Mask[i], NumElts/2))
04680       return false;
04681   return true;
04682 }
04683 
04684 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04685 /// specifies a shuffle of elements that is suitable for input to 128-bit
04686 /// version of MOVDDUP.
04687 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
04688   if (!VT.is128BitVector())
04689     return false;
04690 
04691   unsigned e = VT.getVectorNumElements() / 2;
04692   for (unsigned i = 0; i != e; ++i)
04693     if (!isUndefOrEqual(Mask[i], i))
04694       return false;
04695   for (unsigned i = 0; i != e; ++i)
04696     if (!isUndefOrEqual(Mask[e+i], i))
04697       return false;
04698   return true;
04699 }
04700 
04701 /// isVEXTRACTIndex - Return true if the specified
04702 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
04703 /// suitable for instructions that extract 128- or 256-bit vectors.
04704 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
04705   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04706   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04707     return false;
04708 
04709   // The index should be aligned on a vecWidth-bit boundary.
04710   uint64_t Index =
04711     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04712 
04713   MVT VT = N->getSimpleValueType(0);
04714   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04715   bool Result = (Index * ElSize) % vecWidth == 0;
04716 
04717   return Result;
04718 }
04719 
04720 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
04721 /// operand specifies a subvector insert that is suitable for input to
04722 /// insertion of 128- or 256-bit subvectors.
04723 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
04724   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04725   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04726     return false;
04727   // The index should be aligned on a vecWidth-bit boundary.
04728   uint64_t Index =
04729     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04730 
04731   MVT VT = N->getSimpleValueType(0);
04732   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04733   bool Result = (Index * ElSize) % vecWidth == 0;
04734 
04735   return Result;
04736 }
04737 
04738 bool X86::isVINSERT128Index(SDNode *N) {
04739   return isVINSERTIndex(N, 128);
04740 }
04741 
04742 bool X86::isVINSERT256Index(SDNode *N) {
04743   return isVINSERTIndex(N, 256);
04744 }
04745 
04746 bool X86::isVEXTRACT128Index(SDNode *N) {
04747   return isVEXTRACTIndex(N, 128);
04748 }
04749 
04750 bool X86::isVEXTRACT256Index(SDNode *N) {
04751   return isVEXTRACTIndex(N, 256);
04752 }
04753 
04754 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
04755 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
04756 /// Handles 128-bit and 256-bit.
04757 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
04758   MVT VT = N->getSimpleValueType(0);
04759 
04760   assert((VT.getSizeInBits() >= 128) &&
04761          "Unsupported vector type for PSHUF/SHUFP");
04762 
04763   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
04764   // independently on 128-bit lanes.
04765   unsigned NumElts = VT.getVectorNumElements();
04766   unsigned NumLanes = VT.getSizeInBits()/128;
04767   unsigned NumLaneElts = NumElts/NumLanes;
04768 
04769   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
04770          "Only supports 2, 4 or 8 elements per lane");
04771 
04772   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
04773   unsigned Mask = 0;
04774   for (unsigned i = 0; i != NumElts; ++i) {
04775     int Elt = N->getMaskElt(i);
04776     if (Elt < 0) continue;
04777     Elt &= NumLaneElts - 1;
04778     unsigned ShAmt = (i << Shift) % 8;
04779     Mask |= Elt << ShAmt;
04780   }
04781 
04782   return Mask;
04783 }
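// Worked example (illustrative): for a v4f32 PSHUFD-style mask <3, 2, 1, 0>
// each element contributes 2 bits (Shift == 1), so the immediate is
// 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B, the familiar element-reversal
// shuffle.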
04784 
04785 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
04786 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
04787 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
04788   MVT VT = N->getSimpleValueType(0);
04789 
04790   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04791          "Unsupported vector type for PSHUFHW");
04792 
04793   unsigned NumElts = VT.getVectorNumElements();
04794 
04795   unsigned Mask = 0;
04796   for (unsigned l = 0; l != NumElts; l += 8) {
04797     // 8 nodes per lane, but we only care about the last 4.
04798     for (unsigned i = 0; i < 4; ++i) {
04799       int Elt = N->getMaskElt(l+i+4);
04800       if (Elt < 0) continue;
04801       Elt &= 0x3; // only 2-bits.
04802       Mask |= Elt << (i * 2);
04803     }
04804   }
04805 
04806   return Mask;
04807 }
04808 
04809 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
04810 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
04811 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
04812   MVT VT = N->getSimpleValueType(0);
04813 
04814   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04815          "Unsupported vector type for PSHUFHW");
04816 
04817   unsigned NumElts = VT.getVectorNumElements();
04818 
04819   unsigned Mask = 0;
04820   for (unsigned l = 0; l != NumElts; l += 8) {
04821     // 8 nodes per lane, but we only care about the first 4.
04822     for (unsigned i = 0; i < 4; ++i) {
04823       int Elt = N->getMaskElt(l+i);
04824       if (Elt < 0) continue;
04825       Elt &= 0x3; // only 2-bits
04826       Mask |= Elt << (i * 2);
04827     }
04828   }
04829 
04830   return Mask;
04831 }
04832 
04833 /// \brief Return the appropriate immediate to shuffle the specified
04834 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
04835 /// VALIGN (if InterLane is true) instructions.
04836 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
04837                                            bool InterLane) {
04838   MVT VT = SVOp->getSimpleValueType(0);
04839   unsigned EltSize = InterLane ? 1 :
04840     VT.getVectorElementType().getSizeInBits() >> 3;
04841 
04842   unsigned NumElts = VT.getVectorNumElements();
04843   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
04844   unsigned NumLaneElts = NumElts/NumLanes;
04845 
04846   int Val = 0;
04847   unsigned i;
04848   for (i = 0; i != NumElts; ++i) {
04849     Val = SVOp->getMaskElt(i);
04850     if (Val >= 0)
04851       break;
04852   }
04853   if (Val >= (int)NumElts)
04854     Val -= NumElts - NumLaneElts;
04855 
04856   assert(Val - i > 0 && "PALIGNR imm should be positive");
04857   return (Val - i) * EltSize;
04858 }
04859 
04860 /// \brief Return the appropriate immediate to shuffle the specified
04861 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
04862 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
04863   return getShuffleAlignrImmediate(SVOp, false);
04864 }
04865 
04866 /// \brief Return the appropriate immediate to shuffle the specified
04867 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
04868 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
04869   return getShuffleAlignrImmediate(SVOp, true);
04870 }
04871 
04872 
04873 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
04874   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04875   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04876     llvm_unreachable("Illegal extract subvector for VEXTRACT");
04877 
04878   uint64_t Index =
04879     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04880 
04881   MVT VecVT = N->getOperand(0).getSimpleValueType();
04882   MVT ElVT = VecVT.getVectorElementType();
04883 
04884   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04885   return Index / NumElemsPerChunk;
04886 }
04887 
04888 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
04889   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04890   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04891     llvm_unreachable("Illegal insert subvector for VINSERT");
04892 
04893   uint64_t Index =
04894     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04895 
04896   MVT VecVT = N->getSimpleValueType(0);
04897   MVT ElVT = VecVT.getVectorElementType();
04898 
04899   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04900   return Index / NumElemsPerChunk;
04901 }
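// Both helpers above compute the chunk index the same way. For example
// (hypothetical operands): extracting a 128-bit subvector that starts at
// element 4 of a v8f32 gives NumElemsPerChunk = 128/32 = 4 and therefore an
// immediate of 4/4 = 1 (the upper half); an index of 0 would yield immediate 0.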
04902 
04903 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
04904 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
04905 /// and VEXTRACTI128 instructions.
04906 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
04907   return getExtractVEXTRACTImmediate(N, 128);
04908 }
04909 
04910 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
04911 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
04912 /// and VEXTRACTI64x4 instructions.
04913 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
04914   return getExtractVEXTRACTImmediate(N, 256);
04915 }
04916 
04917 /// getInsertVINSERT128Immediate - Return the appropriate immediate
04918 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
04919 /// and VINSERTI128 instructions.
04920 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
04921   return getInsertVINSERTImmediate(N, 128);
04922 }
04923 
04924 /// getInsertVINSERT256Immediate - Return the appropriate immediate
04925 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
04926 /// and VINSERTI64x4 instructions.
04927 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
04928   return getInsertVINSERTImmediate(N, 256);
04929 }
04930 
04931 /// isZero - Returns true if V is a constant integer zero
04932 static bool isZero(SDValue V) {
04933   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
04934   return C && C->isNullValue();
04935 }
04936 
04937 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
04938 /// constant +0.0.
04939 bool X86::isZeroNode(SDValue Elt) {
04940   if (isZero(Elt))
04941     return true;
04942   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
04943     return CFP->getValueAPF().isPosZero();
04944   return false;
04945 }
04946 
04947 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
04948 /// match movhlps. The lower half elements should come from the upper half of
04949 /// V1 (and in order), and the upper half elements should come from the upper
04950 /// half of V2 (and in order).
04951 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
04952   if (!VT.is128BitVector())
04953     return false;
04954   if (VT.getVectorNumElements() != 4)
04955     return false;
04956   for (unsigned i = 0, e = 2; i != e; ++i)
04957     if (!isUndefOrEqual(Mask[i], i+2))
04958       return false;
04959   for (unsigned i = 2; i != 4; ++i)
04960     if (!isUndefOrEqual(Mask[i], i+4))
04961       return false;
04962   return true;
04963 }
04964 
04965 /// isScalarLoadToVector - Returns true if the node is a scalar load that
04966 /// is promoted to a vector. It also returns the LoadSDNode by reference if
04967 /// required.
04968 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
04969   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
04970     return false;
04971   N = N->getOperand(0).getNode();
04972   if (!ISD::isNON_EXTLoad(N))
04973     return false;
04974   if (LD)
04975     *LD = cast<LoadSDNode>(N);
04976   return true;
04977 }
04978 
04979 // Test whether the given value is a vector value which will be legalized
04980 // into a load.
04981 static bool WillBeConstantPoolLoad(SDNode *N) {
04982   if (N->getOpcode() != ISD::BUILD_VECTOR)
04983     return false;
04984 
04985   // Check for any non-constant elements.
04986   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
04987     switch (N->getOperand(i).getNode()->getOpcode()) {
04988     case ISD::UNDEF:
04989     case ISD::ConstantFP:
04990     case ISD::Constant:
04991       break;
04992     default:
04993       return false;
04994     }
04995 
04996   // Vectors of all-zeros and all-ones are materialized with special
04997   // instructions rather than being loaded.
04998   return !ISD::isBuildVectorAllZeros(N) &&
04999          !ISD::isBuildVectorAllOnes(N);
05000 }
05001 
05002 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
05003 /// match movlp{s|d}. The lower half elements should come from the lower half of
05004 /// V1 (and in order), and the upper half elements should come from the upper
05005 /// half of V2 (and in order). And since V1 will become the source of the
05006 /// MOVLP, it must be either a vector load or a scalar load to vector.
05007 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
05008                                ArrayRef<int> Mask, MVT VT) {
05009   if (!VT.is128BitVector())
05010     return false;
05011 
05012   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
05013     return false;
05014   // If V2 is a vector load, don't do this transformation. We will try to use
05015   // a load-folding shufps op instead.
05016   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
05017     return false;
05018 
05019   unsigned NumElems = VT.getVectorNumElements();
05020 
05021   if (NumElems != 2 && NumElems != 4)
05022     return false;
05023   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
05024     if (!isUndefOrEqual(Mask[i], i))
05025       return false;
05026   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
05027     if (!isUndefOrEqual(Mask[i], i+NumElems))
05028       return false;
05029   return true;
05030 }
05031 
05032 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
05033 /// to a zero vector.
05034 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
05035 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
05036   SDValue V1 = N->getOperand(0);
05037   SDValue V2 = N->getOperand(1);
05038   unsigned NumElems = N->getValueType(0).getVectorNumElements();
05039   for (unsigned i = 0; i != NumElems; ++i) {
05040     int Idx = N->getMaskElt(i);
05041     if (Idx >= (int)NumElems) {
05042       unsigned Opc = V2.getOpcode();
05043       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
05044         continue;
05045       if (Opc != ISD::BUILD_VECTOR ||
05046           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
05047         return false;
05048     } else if (Idx >= 0) {
05049       unsigned Opc = V1.getOpcode();
05050       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
05051         continue;
05052       if (Opc != ISD::BUILD_VECTOR ||
05053           !X86::isZeroNode(V1.getOperand(Idx)))
05054         return false;
05055     }
05056   }
05057   return true;
05058 }
05059 
05060 /// getZeroVector - Returns a vector of specified type with all zero elements.
05061 ///
05062 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
05063                              SelectionDAG &DAG, SDLoc dl) {
05064   assert(VT.isVector() && "Expected a vector type");
05065 
05066   // Always build SSE zero vectors as <4 x i32> bitcasted
05067   // to their dest type. This ensures they get CSE'd.
05068   SDValue Vec;
05069   if (VT.is128BitVector()) {  // SSE
05070     if (Subtarget->hasSSE2()) {  // SSE2
05071       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
05072       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05073     } else { // SSE1
05074       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
05075       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
05076     }
05077   } else if (VT.is256BitVector()) { // AVX
05078     if (Subtarget->hasInt256()) { // AVX2
05079       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
05080       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05081       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
05082     } else {
05083       // 256-bit logic and arithmetic instructions in AVX are all
05084       // floating-point, no support for integer ops. Emit fp zeroed vectors.
05085       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
05086       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05087       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
05088     }
05089   } else if (VT.is512BitVector()) { // AVX-512
05090       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
05091       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
05092                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05093       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
05094   } else if (VT.getScalarType() == MVT::i1) {
05095     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
05096     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
05097     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05098     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05099   } else
05100     llvm_unreachable("Unexpected vector type");
05101 
05102   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
05103 }
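// Editorial note: as a consequence of the branches above, a 256-bit zero on
// AVX without AVX2 is built as a v8f32 of +0.0 and then bitcast to the
// requested type, while AVX2 and AVX-512 use integer build_vectors directly.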
05104 
05105 /// getOnesVector - Returns a vector of specified type with all bits set.
05106 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
05107 /// no AVX2 support, use two <4 x i32> inserted into an <8 x i32> appropriately.
05108 /// Then bitcast to their original type, ensuring they get CSE'd.
05109 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
05110                              SDLoc dl) {
05111   assert(VT.isVector() && "Expected a vector type");
05112 
05113   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
05114   SDValue Vec;
05115   if (VT.is256BitVector()) {
05116     if (HasInt256) { // AVX2
05117       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05118       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
05119     } else { // AVX
05120       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05121       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
05122     }
05123   } else if (VT.is128BitVector()) {
05124     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05125   } else
05126     llvm_unreachable("Unexpected vector type");
05127 
05128   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
05129 }
05130 
05131 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
05132 /// that point to V2 point to its first element.
05133 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
05134   for (unsigned i = 0; i != NumElems; ++i) {
05135     if (Mask[i] > (int)NumElems) {
05136       Mask[i] = NumElems;
05137     }
05138   }
05139 }
05140 
05141 /// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
05142 /// operation of specified width.
05143 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
05144                        SDValue V2) {
05145   unsigned NumElems = VT.getVectorNumElements();
05146   SmallVector<int, 8> Mask;
05147   Mask.push_back(NumElems);
05148   for (unsigned i = 1; i != NumElems; ++i)
05149     Mask.push_back(i);
05150   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05151 }
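// Editorial example: for a 4-element type the loop above builds the mask
// <4, 1, 2, 3>, i.e. element 0 is taken from V2 and the remaining elements
// stay in V1, which is exactly the MOVSS/MOVSD pattern.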
05152 
05153 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
05154 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05155                           SDValue V2) {
05156   unsigned NumElems = VT.getVectorNumElements();
05157   SmallVector<int, 8> Mask;
05158   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
05159     Mask.push_back(i);
05160     Mask.push_back(i + NumElems);
05161   }
05162   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05163 }
05164 
05165 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
05166 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05167                           SDValue V2) {
05168   unsigned NumElems = VT.getVectorNumElements();
05169   SmallVector<int, 8> Mask;
05170   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
05171     Mask.push_back(i + Half);
05172     Mask.push_back(i + NumElems + Half);
05173   }
05174   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05175 }
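// Editorial example: for v4i32 these two helpers produce the masks
// <0, 4, 1, 5> (unpackl) and <2, 6, 3, 7> (unpackh).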
05176 
05177 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
05178 // a generic shuffle instruction because the target has no such instructions.
05179 // Generate shuffles which repeat i16 and i8 several times until they can be
05180 // represented by v4f32 and then be manipulated by target supported shuffles.
05181 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
05182   MVT VT = V.getSimpleValueType();
05183   int NumElems = VT.getVectorNumElements();
05184   SDLoc dl(V);
05185 
05186   while (NumElems > 4) {
05187     if (EltNo < NumElems/2) {
05188       V = getUnpackl(DAG, dl, VT, V, V);
05189     } else {
05190       V = getUnpackh(DAG, dl, VT, V, V);
05191       EltNo -= NumElems/2;
05192     }
05193     NumElems >>= 1;
05194   }
05195   return V;
05196 }
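// Worked example (editorial): splatting element 5 of a v16i8 runs the loop as
// unpackl (5 < 8), then unpackh (5 >= 4, EltNo becomes 1), leaving the byte
// duplicated across what is i32 lane 1 once only four "elements" remain.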
05197 
05198 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
05199 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
05200   MVT VT = V.getSimpleValueType();
05201   SDLoc dl(V);
05202 
05203   if (VT.is128BitVector()) {
05204     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
05205     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
05206     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
05207                              &SplatMask[0]);
05208   } else if (VT.is256BitVector()) {
05209     // To use VPERMILPS to splat scalars, the second half of indices must
05210     // refer to the higher part, which is a duplication of the lower one,
05211     // because VPERMILPS can only handle in-lane permutations.
05212     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
05213                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
05214 
05215     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
05216     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
05217                              &SplatMask[0]);
05218   } else
05219     llvm_unreachable("Vector size not supported");
05220 
05221   return DAG.getNode(ISD::BITCAST, dl, VT, V);
05222 }
05223 
05224 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
05225 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
05226   MVT SrcVT = SV->getSimpleValueType(0);
05227   SDValue V1 = SV->getOperand(0);
05228   SDLoc dl(SV);
05229 
05230   int EltNo = SV->getSplatIndex();
05231   int NumElems = SrcVT.getVectorNumElements();
05232   bool Is256BitVec = SrcVT.is256BitVector();
05233 
05234   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
05235          "Unknown how to promote splat for type");
05236 
05237   // Extract the 128-bit part containing the splat element and update
05238   // the splat element index when it refers to the higher register.
05239   if (Is256BitVec) {
05240     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
05241     if (EltNo >= NumElems/2)
05242       EltNo -= NumElems/2;
05243   }
05244 
05245   // All i16 and i8 vector types can't be used directly by a generic shuffle
05246   // instruction because the target has no such instruction. Generate shuffles
05247   // which repeat i16 and i8 several times until they fit in i32, and then can
05248   // be manipulated by target supported shuffles.
05249   MVT EltVT = SrcVT.getVectorElementType();
05250   if (EltVT == MVT::i8 || EltVT == MVT::i16)
05251     V1 = PromoteSplati8i16(V1, DAG, EltNo);
05252 
05253   // Recreate the 256-bit vector and place the same 128-bit vector
05254   // into the low and high part. This is necessary because we want
05255   // to use VPERM* to shuffle the vectors
05256   if (Is256BitVec) {
05257     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
05258   }
05259 
05260   return getLegalSplat(DAG, V1, EltNo);
05261 }
05262 
05263 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
05264 /// vector of zero or undef vector.  This produces a shuffle where the low
05265 /// element of V2 is swizzled into the zero/undef vector, landing at element
05266 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
05267 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
05268                                            bool IsZero,
05269                                            const X86Subtarget *Subtarget,
05270                                            SelectionDAG &DAG) {
05271   MVT VT = V2.getSimpleValueType();
05272   SDValue V1 = IsZero
05273     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
05274   unsigned NumElems = VT.getVectorNumElements();
05275   SmallVector<int, 16> MaskVec;
05276   for (unsigned i = 0; i != NumElems; ++i)
05277     // If this is the insertion idx, put the low elt of V2 here.
05278     MaskVec.push_back(i == Idx ? NumElems : i);
05279   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
05280 }
05281 
05282 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
05283 /// target specific opcode. Returns true if the Mask could be calculated. Sets
05284 /// IsUnary to true if it only uses one source. Note that this will set IsUnary for
05285 /// shuffles which use a single input multiple times, and in those cases it will
05286 /// adjust the mask to only have indices within that single input.
05287 static bool getTargetShuffleMask(SDNode *N, MVT VT,
05288                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
05289   unsigned NumElems = VT.getVectorNumElements();
05290   SDValue ImmN;
05291 
05292   IsUnary = false;
05293   bool IsFakeUnary = false;
05294   switch(N->getOpcode()) {
05295   case X86ISD::BLENDI:
05296     ImmN = N->getOperand(N->getNumOperands()-1);
05297     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05298     break;
05299   case X86ISD::SHUFP:
05300     ImmN = N->getOperand(N->getNumOperands()-1);
05301     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05302     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05303     break;
05304   case X86ISD::UNPCKH:
05305     DecodeUNPCKHMask(VT, Mask);
05306     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05307     break;
05308   case X86ISD::UNPCKL:
05309     DecodeUNPCKLMask(VT, Mask);
05310     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05311     break;
05312   case X86ISD::MOVHLPS:
05313     DecodeMOVHLPSMask(NumElems, Mask);
05314     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05315     break;
05316   case X86ISD::MOVLHPS:
05317     DecodeMOVLHPSMask(NumElems, Mask);
05318     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05319     break;
05320   case X86ISD::PALIGNR:
05321     ImmN = N->getOperand(N->getNumOperands()-1);
05322     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05323     break;
05324   case X86ISD::PSHUFD:
05325   case X86ISD::VPERMILPI:
05326     ImmN = N->getOperand(N->getNumOperands()-1);
05327     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05328     IsUnary = true;
05329     break;
05330   case X86ISD::PSHUFHW:
05331     ImmN = N->getOperand(N->getNumOperands()-1);
05332     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05333     IsUnary = true;
05334     break;
05335   case X86ISD::PSHUFLW:
05336     ImmN = N->getOperand(N->getNumOperands()-1);
05337     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05338     IsUnary = true;
05339     break;
05340   case X86ISD::PSHUFB: {
05341     IsUnary = true;
05342     SDValue MaskNode = N->getOperand(1);
05343     while (MaskNode->getOpcode() == ISD::BITCAST)
05344       MaskNode = MaskNode->getOperand(0);
05345 
05346     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
05347       // If we have a build-vector, then things are easy.
05348       EVT VT = MaskNode.getValueType();
05349       assert(VT.isVector() &&
05350              "Can't produce a non-vector with a build_vector!");
05351       if (!VT.isInteger())
05352         return false;
05353 
05354       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
05355 
05356       SmallVector<uint64_t, 32> RawMask;
05357       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
05358         SDValue Op = MaskNode->getOperand(i);
05359         if (Op->getOpcode() == ISD::UNDEF) {
05360           RawMask.push_back((uint64_t)SM_SentinelUndef);
05361           continue;
05362         }
05363         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
05364         if (!CN)
05365           return false;
05366         APInt MaskElement = CN->getAPIntValue();
05367 
05368         // We now have to decode the element which could be any integer size and
05369         // extract each byte of it.
05370         for (int j = 0; j < NumBytesPerElement; ++j) {
05371           // Note that this is x86 and so always little endian: the low byte is
05372           // the first byte of the mask.
05373           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
05374           MaskElement = MaskElement.lshr(8);
05375         }
05376       }
05377       DecodePSHUFBMask(RawMask, Mask);
05378       break;
05379     }
05380 
05381     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
05382     if (!MaskLoad)
05383       return false;
05384 
05385     SDValue Ptr = MaskLoad->getBasePtr();
05386     if (Ptr->getOpcode() == X86ISD::Wrapper)
05387       Ptr = Ptr->getOperand(0);
05388 
05389     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
05390     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
05391       return false;
05392 
05393     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
05394       // FIXME: Support AVX-512 here.
05395       Type *Ty = C->getType();
05396       if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 &&
05397                                 Ty->getVectorNumElements() != 32))
05398         return false;
05399 
05400       DecodePSHUFBMask(C, Mask);
05401       break;
05402     }
05403 
05404     return false;
05405   }
05406   case X86ISD::VPERMI:
05407     ImmN = N->getOperand(N->getNumOperands()-1);
05408     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05409     IsUnary = true;
05410     break;
05411   case X86ISD::MOVSS:
05412   case X86ISD::MOVSD: {
05413     // Index 0 always comes from the first element of the second source;
05414     // this is why MOVSS and MOVSD are used in the first place. The other
05415     // elements come from the corresponding positions of the first source vector.
05416     Mask.push_back(NumElems);
05417     for (unsigned i = 1; i != NumElems; ++i) {
05418       Mask.push_back(i);
05419     }
05420     break;
05421   }
05422   case X86ISD::VPERM2X128:
05423     ImmN = N->getOperand(N->getNumOperands()-1);
05424     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05425     if (Mask.empty()) return false;
05426     break;
05427   case X86ISD::MOVSLDUP:
05428     DecodeMOVSLDUPMask(VT, Mask);
05429     break;
05430   case X86ISD::MOVSHDUP:
05431     DecodeMOVSHDUPMask(VT, Mask);
05432     break;
05433   case X86ISD::MOVDDUP:
05434   case X86ISD::MOVLHPD:
05435   case X86ISD::MOVLPD:
05436   case X86ISD::MOVLPS:
05437     // Not yet implemented
05438     return false;
05439   default: llvm_unreachable("unknown target shuffle node");
05440   }
05441 
05442   // If we have a fake unary shuffle, the shuffle mask is spread across two
05443   // inputs that are actually the same node. Re-map the mask to always point
05444   // into the first input.
05445   if (IsFakeUnary)
05446     for (int &M : Mask)
05447       if (M >= (int)Mask.size())
05448         M -= Mask.size();
05449 
05450   return true;
05451 }
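// Usage note (editorial): because of the IsFakeUnary re-mapping above, shuffles
// whose two operands are the same node report a mask whose indices all refer to
// operand 0, so callers such as getShuffleScalarElt below only ever chase
// operand 0 for such shuffles.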
05452 
05453 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
05454 /// element of the result of the vector shuffle.
05455 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
05456                                    unsigned Depth) {
05457   if (Depth == 6)
05458     return SDValue();  // Limit search depth.
05459 
05460   SDValue V = SDValue(N, 0);
05461   EVT VT = V.getValueType();
05462   unsigned Opcode = V.getOpcode();
05463 
05464   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
05465   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
05466     int Elt = SV->getMaskElt(Index);
05467 
05468     if (Elt < 0)
05469       return DAG.getUNDEF(VT.getVectorElementType());
05470 
05471     unsigned NumElems = VT.getVectorNumElements();
05472     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
05473                                          : SV->getOperand(1);
05474     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
05475   }
05476 
05477   // Recurse into target specific vector shuffles to find scalars.
05478   if (isTargetShuffle(Opcode)) {
05479     MVT ShufVT = V.getSimpleValueType();
05480     unsigned NumElems = ShufVT.getVectorNumElements();
05481     SmallVector<int, 16> ShuffleMask;
05482     bool IsUnary;
05483 
05484     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
05485       return SDValue();
05486 
05487     int Elt = ShuffleMask[Index];
05488     if (Elt < 0)
05489       return DAG.getUNDEF(ShufVT.getVectorElementType());
05490 
05491     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
05492                                          : N->getOperand(1);
05493     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
05494                                Depth+1);
05495   }
05496 
05497   // Actual nodes that may contain scalar elements
05498   if (Opcode == ISD::BITCAST) {
05499     V = V.getOperand(0);
05500     EVT SrcVT = V.getValueType();
05501     unsigned NumElems = VT.getVectorNumElements();
05502 
05503     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
05504       return SDValue();
05505   }
05506 
05507   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
05508     return (Index == 0) ? V.getOperand(0)
05509                         : DAG.getUNDEF(VT.getVectorElementType());
05510 
05511   if (V.getOpcode() == ISD::BUILD_VECTOR)
05512     return V.getOperand(Index);
05513 
05514   return SDValue();
05515 }
05516 
05517 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
05518 /// shuffle operation which consecutively come from a zero. The
05519 /// search can start in two different directions, from left or right.
05520 /// We count undefs as zeros until PreferredNum is reached.
05521 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
05522                                          unsigned NumElems, bool ZerosFromLeft,
05523                                          SelectionDAG &DAG,
05524                                          unsigned PreferredNum = -1U) {
05525   unsigned NumZeros = 0;
05526   for (unsigned i = 0; i != NumElems; ++i) {
05527     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
05528     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
05529     if (!Elt.getNode())
05530       break;
05531 
05532     if (X86::isZeroNode(Elt))
05533       ++NumZeros;
05534     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
05535       NumZeros = std::min(NumZeros + 1, PreferredNum);
05536     else
05537       break;
05538   }
05539 
05540   return NumZeros;
05541 }
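// Editorial example: if a v4i32 shuffle's scalars resolve to <0, 0, A, B>,
// calling this with ZerosFromLeft == true returns 2, while searching from the
// right stops immediately at B and returns 0.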
05542 
05543 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
05544 /// correspond consecutively to elements from one of the vector operands,
05545 /// starting from its index OpIdx. Also sets OpNum to that source vector operand.
05546 static
05547 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
05548                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
05549                               unsigned NumElems, unsigned &OpNum) {
05550   bool SeenV1 = false;
05551   bool SeenV2 = false;
05552 
05553   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
05554     int Idx = SVOp->getMaskElt(i);
05555     // Ignore undef indices
05556     if (Idx < 0)
05557       continue;
05558 
05559     if (Idx < (int)NumElems)
05560       SeenV1 = true;
05561     else
05562       SeenV2 = true;
05563 
05564     // Only accept consecutive elements from the same vector
05565     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
05566       return false;
05567   }
05568 
05569   OpNum = SeenV1 ? 0 : 1;
05570   return true;
05571 }
05572 
05573 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
05574 /// logical right shift of a vector.
05575 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05576                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05577   unsigned NumElems =
05578     SVOp->getSimpleValueType(0).getVectorNumElements();
05579   unsigned NumZeros = getNumOfConsecutiveZeros(
05580       SVOp, NumElems, false /* check zeros from right */, DAG,
05581       SVOp->getMaskElt(0));
05582   unsigned OpSrc;
05583 
05584   if (!NumZeros)
05585     return false;
05586 
05587   // Considering the elements in the mask that are not consecutive zeros,
05588   // check if they consecutively come from only one of the source vectors.
05589   //
05590   //               V1 = {X, A, B, C}     0
05591   //                         \  \  \    /
05592   //   vector_shuffle V1, V2 <1, 2, 3, X>
05593   //
05594   if (!isShuffleMaskConsecutive(SVOp,
05595             0,                   // Mask Start Index
05596             NumElems-NumZeros,   // Mask End Index(exclusive)
05597             NumZeros,            // Where to start looking in the src vector
05598             NumElems,            // Number of elements in vector
05599             OpSrc))              // Which source operand ?
05600     return false;
05601 
05602   isLeft = false;
05603   ShAmt = NumZeros;
05604   ShVal = SVOp->getOperand(OpSrc);
05605   return true;
05606 }
05607 
05608 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
05609 /// logical left shift of a vector.
05610 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05611                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05612   unsigned NumElems =
05613     SVOp->getSimpleValueType(0).getVectorNumElements();
05614   unsigned NumZeros = getNumOfConsecutiveZeros(
05615       SVOp, NumElems, true /* check zeros from left */, DAG,
05616       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
05617   unsigned OpSrc;
05618 
05619   if (!NumZeros)
05620     return false;
05621 
05622   // Considering the elements in the mask that are not consecutive zeros,
05623   // check if they consecutively come from only one of the source vectors.
05624   //
05625   //                           0    { A, B, X, X } = V2
05626   //                          / \    /  /
05627   //   vector_shuffle V1, V2 <X, X, 4, 5>
05628   //
05629   if (!isShuffleMaskConsecutive(SVOp,
05630             NumZeros,     // Mask Start Index
05631             NumElems,     // Mask End Index(exclusive)
05632             0,            // Where to start looking in the src vector
05633             NumElems,     // Number of elements in vector
05634             OpSrc))       // Which source operand ?
05635     return false;
05636 
05637   isLeft = true;
05638   ShAmt = NumZeros;
05639   ShVal = SVOp->getOperand(OpSrc);
05640   return true;
05641 }
05642 
05643 /// isVectorShift - Returns true if the shuffle can be implemented as a
05644 /// logical left or right shift of a vector.
05645 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05646                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05647   // Although the logic below supports any bit width, there are no
05648   // shift instructions which handle more than 128-bit vectors.
05649   if (!SVOp->getSimpleValueType(0).is128BitVector())
05650     return false;
05651 
05652   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
05653       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
05654     return true;
05655 
05656   return false;
05657 }
05658 
05659 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
05660 ///
05661 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
05662                                        unsigned NumNonZero, unsigned NumZero,
05663                                        SelectionDAG &DAG,
05664                                        const X86Subtarget* Subtarget,
05665                                        const TargetLowering &TLI) {
05666   if (NumNonZero > 8)
05667     return SDValue();
05668 
05669   SDLoc dl(Op);
05670   SDValue V;
05671   bool First = true;
05672   for (unsigned i = 0; i < 16; ++i) {
05673     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
05674     if (ThisIsNonZero && First) {
05675       if (NumZero)
05676         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05677       else
05678         V = DAG.getUNDEF(MVT::v8i16);
05679       First = false;
05680     }
05681 
05682     if ((i & 1) != 0) {
05683       SDValue ThisElt, LastElt;
05684       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
05685       if (LastIsNonZero) {
05686         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
05687                               MVT::i16, Op.getOperand(i-1));
05688       }
05689       if (ThisIsNonZero) {
05690         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
05691         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
05692                               ThisElt, DAG.getConstant(8, MVT::i8));
05693         if (LastIsNonZero)
05694           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
05695       } else
05696         ThisElt = LastElt;
05697 
05698       if (ThisElt.getNode())
05699         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
05700                         DAG.getIntPtrConstant(i/2));
05701     }
05702   }
05703 
05704   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
05705 }
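// Editorial note: the loop above packs pairs of i8 operands into i16 lanes
// (operand i-1 as the low byte, operand i shifted left by 8 as the high byte)
// and finally bitcasts the assembled v8i16 back to v16i8.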
05706 
05707 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
05708 ///
05709 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
05710                                      unsigned NumNonZero, unsigned NumZero,
05711                                      SelectionDAG &DAG,
05712                                      const X86Subtarget* Subtarget,
05713                                      const TargetLowering &TLI) {
05714   if (NumNonZero > 4)
05715     return SDValue();
05716 
05717   SDLoc dl(Op);
05718   SDValue V;
05719   bool First = true;
05720   for (unsigned i = 0; i < 8; ++i) {
05721     bool isNonZero = (NonZeros & (1 << i)) != 0;
05722     if (isNonZero) {
05723       if (First) {
05724         if (NumZero)
05725           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05726         else
05727           V = DAG.getUNDEF(MVT::v8i16);
05728         First = false;
05729       }
05730       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
05731                       MVT::v8i16, V, Op.getOperand(i),
05732                       DAG.getIntPtrConstant(i));
05733     }
05734   }
05735 
05736   return V;
05737 }
05738 
05739 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
05740 static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
05741                                      unsigned NonZeros, unsigned NumNonZero,
05742                                      unsigned NumZero, SelectionDAG &DAG,
05743                                      const X86Subtarget *Subtarget,
05744                                      const TargetLowering &TLI) {
05745   // We know there's at least one non-zero element
05746   unsigned FirstNonZeroIdx = 0;
05747   SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
05748   while (FirstNonZero.getOpcode() == ISD::UNDEF ||
05749          X86::isZeroNode(FirstNonZero)) {
05750     ++FirstNonZeroIdx;
05751     FirstNonZero = Op->getOperand(FirstNonZeroIdx);
05752   }
05753 
05754   if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05755       !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
05756     return SDValue();
05757 
05758   SDValue V = FirstNonZero.getOperand(0);
05759   MVT VVT = V.getSimpleValueType();
05760   if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
05761     return SDValue();
05762 
05763   unsigned FirstNonZeroDst =
05764       cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
05765   unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
05766   unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
05767   unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
05768 
05769   for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
05770     SDValue Elem = Op.getOperand(Idx);
05771     if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
05772       continue;
05773 
05774     // TODO: What else can be here? Deal with it.
05775     if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
05776       return SDValue();
05777 
05778     // TODO: Some optimizations are still possible here
05779     // ex: Getting one element from a vector, and the rest from another.
05780     if (Elem.getOperand(0) != V)
05781       return SDValue();
05782 
05783     unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
05784     if (Dst == Idx)
05785       ++CorrectIdx;
05786     else if (IncorrectIdx == -1U) {
05787       IncorrectIdx = Idx;
05788       IncorrectDst = Dst;
05789     } else
05790       // There was already one element with an incorrect index.
05791       // We can't optimize this case to an insertps.
05792       return SDValue();
05793   }
05794 
05795   if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
05796     SDLoc dl(Op);
05797     EVT VT = Op.getSimpleValueType();
05798     unsigned ElementMoveMask = 0;
05799     if (IncorrectIdx == -1U)
05800       ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
05801     else
05802       ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
05803 
05804     SDValue InsertpsMask =
05805         DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
05806     return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
05807   }
05808 
05809   return SDValue();
05810 }
05811 
05812 /// getVShift - Return a vector logical shift node.
05813 ///
05814 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
05815                          unsigned NumBits, SelectionDAG &DAG,
05816                          const TargetLowering &TLI, SDLoc dl) {
05817   assert(VT.is128BitVector() && "Unknown type for VShift");
05818   EVT ShVT = MVT::v2i64;
05819   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
05820   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
05821   return DAG.getNode(ISD::BITCAST, dl, VT,
05822                      DAG.getNode(Opc, dl, ShVT, SrcOp,
05823                              DAG.getConstant(NumBits,
05824                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
05825 }
05826 
05827 static SDValue
05828 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
05829 
05830   // Check if the scalar load can be widened into a vector load and, if the
05831   // address is "base + cst", see if the cst can be "absorbed" into the
05832   // shuffle mask.
05833   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
05834     SDValue Ptr = LD->getBasePtr();
05835     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
05836       return SDValue();
05837     EVT PVT = LD->getValueType(0);
05838     if (PVT != MVT::i32 && PVT != MVT::f32)
05839       return SDValue();
05840 
05841     int FI = -1;
05842     int64_t Offset = 0;
05843     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
05844       FI = FINode->getIndex();
05845       Offset = 0;
05846     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
05847                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
05848       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
05849       Offset = Ptr.getConstantOperandVal(1);
05850       Ptr = Ptr.getOperand(0);
05851     } else {
05852       return SDValue();
05853     }
05854 
05855     // FIXME: 256-bit vector instructions don't require a strict alignment,
05856     // improve this code to support it better.
05857     unsigned RequiredAlign = VT.getSizeInBits()/8;
05858     SDValue Chain = LD->getChain();
05859     // Make sure the stack object alignment is at least 16 or 32.
05860     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
05861     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
05862       if (MFI->isFixedObjectIndex(FI)) {
05863         // Can't change the alignment. FIXME: It's possible to compute
05864         // the exact stack offset and reference FI + adjust offset instead.
05865         // If someone *really* cares about this, that's the way to implement it.
05866         return SDValue();
05867       } else {
05868         MFI->setObjectAlignment(FI, RequiredAlign);
05869       }
05870     }
05871 
05872     // (Offset % 16 or 32) must be a multiple of 4. The address is then
05873     // Ptr + (Offset & ~15).
05874     if (Offset < 0)
05875       return SDValue();
05876     if ((Offset % RequiredAlign) & 3)
05877       return SDValue();
05878     int64_t StartOffset = Offset & ~(RequiredAlign-1);
05879     if (StartOffset)
05880       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
05881                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
05882 
05883     int EltNo = (Offset - StartOffset) >> 2;
05884     unsigned NumElems = VT.getVectorNumElements();
05885 
05886     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
05887     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
05888                              LD->getPointerInfo().getWithOffset(StartOffset),
05889                              false, false, false, 0);
05890 
05891     SmallVector<int, 8> Mask;
05892     for (unsigned i = 0; i != NumElems; ++i)
05893       Mask.push_back(EltNo);
05894 
05895     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
05896   }
05897 
05898   return SDValue();
05899 }
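// Worked example (editorial): widening an f32 load from "stack slot + 8" to a
// v4f32 requires 16-byte alignment; StartOffset becomes 8 & ~15 = 0 and
// EltNo = (8 - 0) >> 2 = 2, so the wide load is splatted from element 2.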
05900 
05901 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
05902 /// vector of type 'VT', see if the elements can be replaced by a single large
05903 /// load which has the same value as a build_vector whose operands are 'elts'.
05904 ///
05905 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
05906 ///
05907 /// FIXME: we'd also like to handle the case where the last elements are zero
05908 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
05909 /// There's even a handy isZeroNode for that purpose.
05910 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
05911                                         SDLoc &DL, SelectionDAG &DAG,
05912                                         bool isAfterLegalize) {
05913   EVT EltVT = VT.getVectorElementType();
05914   unsigned NumElems = Elts.size();
05915 
05916   LoadSDNode *LDBase = nullptr;
05917   unsigned LastLoadedElt = -1U;
05918 
05919   // For each element in the initializer, see if we've found a load or an undef.
05920   // If we don't find an initial load element, or later load elements are
05921   // non-consecutive, bail out.
05922   for (unsigned i = 0; i < NumElems; ++i) {
05923     SDValue Elt = Elts[i];
05924 
05925     if (!Elt.getNode() ||
05926         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
05927       return SDValue();
05928     if (!LDBase) {
05929       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
05930         return SDValue();
05931       LDBase = cast<LoadSDNode>(Elt.getNode());
05932       LastLoadedElt = i;
05933       continue;
05934     }
05935     if (Elt.getOpcode() == ISD::UNDEF)
05936       continue;
05937 
05938     LoadSDNode *LD = cast<LoadSDNode>(Elt);
05939     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
05940       return SDValue();
05941     LastLoadedElt = i;
05942   }
05943 
05944   // If we have found an entire vector of loads and undefs, then return a large
05945   // load of the entire vector width starting at the base pointer.  If we found
05946   // consecutive loads for the low half, generate a vzext_load node.
05947   if (LastLoadedElt == NumElems - 1) {
05948 
05949     if (isAfterLegalize &&
05950         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
05951       return SDValue();
05952 
05953     SDValue NewLd = SDValue();
05954 
05955     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
05956       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05957                           LDBase->getPointerInfo(),
05958                           LDBase->isVolatile(), LDBase->isNonTemporal(),
05959                           LDBase->isInvariant(), 0);
05960     else NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05961                         LDBase->getPointerInfo(),
05962                         LDBase->isVolatile(), LDBase->isNonTemporal(),
05963                         LDBase->isInvariant(), LDBase->getAlignment());
05964 
05965     if (LDBase->hasAnyUseOfValue(1)) {
05966       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05967                                      SDValue(LDBase, 1),
05968                                      SDValue(NewLd.getNode(), 1));
05969       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05970       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05971                              SDValue(NewLd.getNode(), 1));
05972     }
05973 
05974     return NewLd;
05975   }
05976   if (NumElems == 4 && LastLoadedElt == 1 &&
05977       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
05978     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
05979     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
05980     SDValue ResNode =
05981         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
05982                                 LDBase->getPointerInfo(),
05983                                 LDBase->getAlignment(),
05984                                 false/*isVolatile*/, true/*ReadMem*/,
05985                                 false/*WriteMem*/);
05986 
05987     // Make sure the newly-created LOAD is in the same position as LDBase in
05988     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
05989     // update uses of LDBase's output chain to use the TokenFactor.
05990     if (LDBase->hasAnyUseOfValue(1)) {
05991       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05992                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
05993       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05994       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05995                              SDValue(ResNode.getNode(), 1));
05996     }
05997 
05998     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
05999   }
06000   return SDValue();
06001 }
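// Editorial note: the second path above matches patterns such as
// <load a, load a+4, undef, undef> for a 4-element vector and emits a
// VZEXT_LOAD of i64, as promised in the function comment.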
06002 
06003 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
06004 /// to generate a splat value for the following cases:
06005 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
06006 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
06007 /// a scalar load, or a constant.
06008 /// The VBROADCAST node is returned when a pattern is found,
06009 /// or SDValue() otherwise.
06010 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
06011                                     SelectionDAG &DAG) {
06012   // VBROADCAST requires AVX.
06013   // TODO: Splats could be generated for non-AVX CPUs using SSE
06014   // instructions, but there's less potential gain for only 128-bit vectors.
06015   if (!Subtarget->hasAVX())
06016     return SDValue();
06017 
06018   MVT VT = Op.getSimpleValueType();
06019   SDLoc dl(Op);
06020 
06021   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
06022          "Unsupported vector type for broadcast.");
06023 
06024   SDValue Ld;
06025   bool ConstSplatVal;
06026 
06027   switch (Op.getOpcode()) {
06028     default:
06029       // Unknown pattern found.
06030       return SDValue();
06031 
06032     case ISD::BUILD_VECTOR: {
06033       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
06034       BitVector UndefElements;
06035       SDValue Splat = BVOp->getSplatValue(&UndefElements);
06036 
06037       // We need a splat of a single value to use broadcast, and it doesn't
06038       // make any sense if the value is only in one element of the vector.
06039       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
06040         return SDValue();
06041 
06042       Ld = Splat;
06043       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
06044                        Ld.getOpcode() == ISD::ConstantFP);
06045 
06046       // Make sure that all of the users of a non-constant load are from the
06047       // BUILD_VECTOR node.
06048       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
06049         return SDValue();
06050       break;
06051     }
06052 
06053     case ISD::VECTOR_SHUFFLE: {
06054       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
06055 
06056       // Shuffles must have a splat mask where the first element is
06057       // broadcasted.
06058       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
06059         return SDValue();
06060 
06061       SDValue Sc = Op.getOperand(0);
06062       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
06063           Sc.getOpcode() != ISD::BUILD_VECTOR) {
06064 
06065         if (!Subtarget->hasInt256())
06066           return SDValue();
06067 
06068         // Use the register form of the broadcast instruction available on AVX2.
06069         if (VT.getSizeInBits() >= 256)
06070           Sc = Extract128BitVector(Sc, 0, DAG, dl);
06071         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
06072       }
06073 
06074       Ld = Sc.getOperand(0);
06075       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
06076                        Ld.getOpcode() == ISD::ConstantFP);
06077 
06078       // The scalar_to_vector node and the suspected
06079       // load node must have exactly one user.
06080       // Constants may have multiple users.
06081 
06082       // AVX-512 has a register version of the broadcast
06083       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
06084         Ld.getValueType().getSizeInBits() >= 32;
06085       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
06086           !hasRegVer))
06087         return SDValue();
06088       break;
06089     }
06090   }
06091 
06092   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
06093   bool IsGE256 = (VT.getSizeInBits() >= 256);
06094 
06095   // When optimizing for size, generate up to 5 extra bytes for a broadcast
06096   // instruction to save 8 or more bytes of constant pool data.
06097   // TODO: If multiple splats are generated to load the same constant,
06098   // it may be detrimental to overall size. There needs to be a way to detect
06099   // that condition to know if this is truly a size win.
06100   const Function *F = DAG.getMachineFunction().getFunction();
06101   bool OptForSize = F->getAttributes().
06102     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
06103 
06104   // Handle broadcasting a single constant scalar from the constant pool
06105   // into a vector.
06106   // On Sandybridge (no AVX2), it is still better to load a constant vector
06107   // from the constant pool and not to broadcast it from a scalar.
06108   // But override that restriction when optimizing for size.
06109   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
06110   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
06111     EVT CVT = Ld.getValueType();
06112     assert(!CVT.isVector() && "Must not broadcast a vector type");
06113 
06114     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
06115     // For size optimization, also splat v2f64 and v2i64, and for size opt
06116     // with AVX2, also splat i8 and i16.
06117     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
06118     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
06119         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
06120       const Constant *C = nullptr;
06121       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
06122         C = CI->getConstantIntValue();
06123       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
06124         C = CF->getConstantFPValue();
06125 
06126       assert(C && "Invalid constant type");
06127 
06128       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06129       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
06130       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
06131       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
06132                        MachinePointerInfo::getConstantPool(),
06133                        false, false, false, Alignment);
06134 
06135       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06136     }
06137   }
06138 
06139   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
06140 
06141   // Handle AVX2 in-register broadcasts.
06142   if (!IsLoad && Subtarget->hasInt256() &&
06143       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
06144     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06145 
06146   // The scalar source must be a normal load.
06147   if (!IsLoad)
06148     return SDValue();
06149 
06150   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
06151     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06152 
06153   // The integer check is needed for the 64-bit into 128-bit case, so it
06154   // doesn't match double, since there is no vbroadcastsd xmm.
06155   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
06156     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
06157       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06158   }
06159 
06160   // Unsupported broadcast.
06161   return SDValue();
06162 }
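// Summary (editorial): the checks above try, in order, a constant-pool
// broadcast of a splatted constant, an AVX2 in-register broadcast, a broadcast
// of a normal 32-bit (or, for 256-bit and wider, 64-bit) scalar load, and
// finally the AVX2-only 8/16/64-bit integer load cases.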
06163 
06164 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
06165 /// underlying vector and index.
06166 ///
06167 /// Modifies \p ExtractedFromVec to the real vector and returns the real
06168 /// index.
06169 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
06170                                          SDValue ExtIdx) {
06171   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
06172   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
06173     return Idx;
06174 
06175   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
06176   // lowered this:
06177   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
06178   // to:
06179   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
06180   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
06181   //                           undef)
06182   //                       Constant<0>)
06183   // In this case the vector is the extract_subvector expression and the index
06184   // is 2, as specified by the shuffle.
06185   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
06186   SDValue ShuffleVec = SVOp->getOperand(0);
06187   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
06188   assert(ShuffleVecVT.getVectorElementType() ==
06189          ExtractedFromVec.getSimpleValueType().getVectorElementType());
06190 
06191   int ShuffleIdx = SVOp->getMaskElt(Idx);
06192   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
06193     ExtractedFromVec = ShuffleVec;
06194     return ShuffleIdx;
06195   }
06196   return Idx;
06197 }
06198 
06199 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
06200   MVT VT = Op.getSimpleValueType();
06201 
06202   // Skip if insert_vec_elt is not supported.
06203   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06204   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
06205     return SDValue();
06206 
06207   SDLoc DL(Op);
06208   unsigned NumElems = Op.getNumOperands();
06209 
06210   SDValue VecIn1;
06211   SDValue VecIn2;
06212   SmallVector<unsigned, 4> InsertIndices;
06213   SmallVector<int, 8> Mask(NumElems, -1);
06214 
06215   for (unsigned i = 0; i != NumElems; ++i) {
06216     unsigned Opc = Op.getOperand(i).getOpcode();
06217 
06218     if (Opc == ISD::UNDEF)
06219       continue;
06220 
06221     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
06222       // Quit if more than two elements would need inserting.
06223       if (InsertIndices.size() > 1)
06224         return SDValue();
06225 
06226       InsertIndices.push_back(i);
06227       continue;
06228     }
06229 
06230     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
06231     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
06232     // Quit if non-constant index.
06233     if (!isa<ConstantSDNode>(ExtIdx))
06234       return SDValue();
06235     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
06236 
06237     // Quit if extracted from vector of different type.
06238     if (ExtractedFromVec.getValueType() != VT)
06239       return SDValue();
06240 
06241     if (!VecIn1.getNode())
06242       VecIn1 = ExtractedFromVec;
06243     else if (VecIn1 != ExtractedFromVec) {
06244       if (!VecIn2.getNode())
06245         VecIn2 = ExtractedFromVec;
06246       else if (VecIn2 != ExtractedFromVec)
06247         // Quit if more than two source vectors would need shuffling.
06248         return SDValue();
06249     }
06250 
06251     if (ExtractedFromVec == VecIn1)
06252       Mask[i] = Idx;
06253     else if (ExtractedFromVec == VecIn2)
06254       Mask[i] = Idx + NumElems;
06255   }
06256 
06257   if (!VecIn1.getNode())
06258     return SDValue();
06259 
06260   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
06261   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
06262   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
06263     unsigned Idx = InsertIndices[i];
06264     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
06265                      DAG.getIntPtrConstant(Idx));
06266   }
06267 
06268   return NV;
06269 }
06270 
06271 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
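// Broadly (a rough sketch of the cases handled below, not an exhaustive
// list): an all-zeros or all-ones mask is rebuilt from constant i1 operands,
// an all-constant mask is materialized as an immediate that is bitcast back
// to the mask type, and a mask with a single non-constant operand is handled
// with an INSERT_VECTOR_ELT into either a constant mask or UNDEF.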
06272 SDValue
06273 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
06274 
06275   MVT VT = Op.getSimpleValueType();
06276   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
06277          "Unexpected type in LowerBUILD_VECTORvXi1!");
06278 
06279   SDLoc dl(Op);
06280   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06281     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
06282     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06283     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06284   }
06285 
06286   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
06287     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
06288     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06289     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06290   }
06291 
06292   bool AllConstants = true;
06293   uint64_t Immediate = 0;
06294   int NonConstIdx = -1;
06295   bool IsSplat = true;
06296   unsigned NumNonConsts = 0;
06297   unsigned NumConsts = 0;
06298   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
06299     SDValue In = Op.getOperand(idx);
06300     if (In.getOpcode() == ISD::UNDEF)
06301       continue;
06302     if (!isa<ConstantSDNode>(In)) {
06303       AllConstants = false;
06304       NonConstIdx = idx;
06305       NumNonConsts++;
06306     }
06307     else {
06308       NumConsts++;
06309       if (cast<ConstantSDNode>(In)->getZExtValue())
06310         Immediate |= (1ULL << idx);
06311     }
06312     if (In != Op.getOperand(0))
06313       IsSplat = false;
06314   }
06315 
06316   if (AllConstants) {
06317     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
06318       DAG.getConstant(Immediate, MVT::i16));
06319     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
06320                        DAG.getIntPtrConstant(0));
06321   }
06322 
06323   if (NumNonConsts == 1 && NonConstIdx != 0) {
06324     SDValue DstVec;
06325     if (NumConsts) {
06326       SDValue VecAsImm = DAG.getConstant(Immediate,
06327                                          MVT::getIntegerVT(VT.getSizeInBits()));
06328       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
06329     }
06330     else 
06331       DstVec = DAG.getUNDEF(VT);
06332     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
06333                        Op.getOperand(NonConstIdx),
06334                        DAG.getIntPtrConstant(NonConstIdx));
06335   }
06336   if (!IsSplat && (NonConstIdx != 0))
06337     llvm_unreachable("Unsupported BUILD_VECTOR operation");
06338   MVT SelectVT = (VT == MVT::v16i1) ? MVT::i16 : MVT::i8;
06339   SDValue Select;
06340   if (IsSplat)
06341     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06342                           DAG.getConstant(-1, SelectVT),
06343                           DAG.getConstant(0, SelectVT));
06344   else
06345     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06346                          DAG.getConstant((Immediate | 1), SelectVT),
06347                          DAG.getConstant(Immediate, SelectVT));
06348   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
06349 }
06350 
06351 /// \brief Return true if \p N implements a horizontal binop, and place the
06352 /// operands of that horizontal binop into V0 and V1.
06353 /// 
06354 /// This is a helper function of PerformBUILD_VECTORCombine.
06355 /// This function checks whether the input build_vector \p N implements a
06356 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
06357 /// operation to match.
06358 /// For example, if \p Opcode is equal to ISD::ADD, then this function
06359 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
06360 /// is equal to ISD::SUB, then this function checks if this is a horizontal
06361 /// arithmetic sub.
06362 ///
06363 /// This function only analyzes elements of \p N whose indices are
06364 /// in range [BaseIdx, LastIdx).
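///
/// As an illustrative sketch (the vectors %A and %B here are hypothetical),
/// with \p Opcode equal to ISD::FADD and [BaseIdx, LastIdx) covering a whole
/// v4f32 build_vector, the following input is accepted:
///   (build_vector (fadd (extract_vector_elt %A, 0), (extract_vector_elt %A, 1)),
///                 (fadd (extract_vector_elt %A, 2), (extract_vector_elt %A, 3)),
///                 (fadd (extract_vector_elt %B, 0), (extract_vector_elt %B, 1)),
///                 (fadd (extract_vector_elt %B, 2), (extract_vector_elt %B, 3)))
/// and V0/V1 are set to %A and %B, which the caller can then combine into a
/// single X86ISD::FHADD node.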
06365 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
06366                               SelectionDAG &DAG,
06367                               unsigned BaseIdx, unsigned LastIdx,
06368                               SDValue &V0, SDValue &V1) {
06369   EVT VT = N->getValueType(0);
06370 
06371   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
06372   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
06373          "Invalid Vector in input!");
06374   
06375   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
06376   bool CanFold = true;
06377   unsigned ExpectedVExtractIdx = BaseIdx;
06378   unsigned NumElts = LastIdx - BaseIdx;
06379   V0 = DAG.getUNDEF(VT);
06380   V1 = DAG.getUNDEF(VT);
06381 
06382   // Check if N implements a horizontal binop.
06383   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
06384     SDValue Op = N->getOperand(i + BaseIdx);
06385 
06386     // Skip UNDEFs.
06387     if (Op->getOpcode() == ISD::UNDEF) {
06388       // Update the expected vector extract index.
06389       if (i * 2 == NumElts)
06390         ExpectedVExtractIdx = BaseIdx;
06391       ExpectedVExtractIdx += 2;
06392       continue;
06393     }
06394 
06395     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
06396 
06397     if (!CanFold)
06398       break;
06399 
06400     SDValue Op0 = Op.getOperand(0);
06401     SDValue Op1 = Op.getOperand(1);
06402 
06403     // Try to match the following pattern:
06404     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
06405     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06406         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06407         Op0.getOperand(0) == Op1.getOperand(0) &&
06408         isa<ConstantSDNode>(Op0.getOperand(1)) &&
06409         isa<ConstantSDNode>(Op1.getOperand(1)));
06410     if (!CanFold)
06411       break;
06412 
06413     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06414     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
06415 
06416     if (i * 2 < NumElts) {
06417       if (V0.getOpcode() == ISD::UNDEF)
06418         V0 = Op0.getOperand(0);
06419     } else {
06420       if (V1.getOpcode() == ISD::UNDEF)
06421         V1 = Op0.getOperand(0);
06422       if (i * 2 == NumElts)
06423         ExpectedVExtractIdx = BaseIdx;
06424     }
06425 
06426     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
06427     if (I0 == ExpectedVExtractIdx)
06428       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
06429     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
06430       // Try to match the following dag sequence:
06431       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
06432       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
06433     } else
06434       CanFold = false;
06435 
06436     ExpectedVExtractIdx += 2;
06437   }
06438 
06439   return CanFold;
06440 }
06441 
06442 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
06443 /// a concat_vectors node.
06444 ///
06445 /// This is a helper function of PerformBUILD_VECTORCombine.
06446 /// This function expects two 256-bit vectors called V0 and V1.
06447 /// At first, each vector is split into two separate 128-bit vectors.
06448 /// Then, the resulting 128-bit vectors are used to implement two
06449 /// horizontal binary operations. 
06450 ///
06451 /// The kind of horizontal binary operation is defined by \p X86Opcode.
06452 ///
06453 /// \p Mode specifies how the 128-bit halves of V0 and V1 are passed to the
06454 /// two new horizontal binops.
06455 /// When \p Mode is set, the first horizontal binop dag node takes as input the
06456 /// lower 128 bits of V0 and the upper 128 bits of V0. The second horizontal
06457 /// binop dag node takes as input the lower 128 bits of V1 and the upper
06458 /// 128 bits of V1.
06459 ///   Example:
06460 ///     HADD V0_LO, V0_HI
06461 ///     HADD V1_LO, V1_HI
06462 ///
06463 /// Otherwise, the first horizontal binop dag node takes as input the lower
06464 /// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
06465 /// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
06466 ///   Example:
06467 ///     HADD V0_LO, V1_LO
06468 ///     HADD V0_HI, V1_HI
06469 ///
06470 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
06471 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
06472 /// the upper 128-bits of the result.
06473 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
06474                                      SDLoc DL, SelectionDAG &DAG,
06475                                      unsigned X86Opcode, bool Mode,
06476                                      bool isUndefLO, bool isUndefHI) {
06477   EVT VT = V0.getValueType();
06478   assert(VT.is256BitVector() && VT == V1.getValueType() &&
06479          "Invalid nodes in input!");
06480 
06481   unsigned NumElts = VT.getVectorNumElements();
06482   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
06483   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
06484   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
06485   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
06486   EVT NewVT = V0_LO.getValueType();
06487 
06488   SDValue LO = DAG.getUNDEF(NewVT);
06489   SDValue HI = DAG.getUNDEF(NewVT);
06490 
06491   if (Mode) {
06492     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06493     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
06494       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
06495     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
06496       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
06497   } else {
06498     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06499     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
06500                        V1_LO->getOpcode() != ISD::UNDEF))
06501       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
06502 
06503     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
06504                        V1_HI->getOpcode() != ISD::UNDEF))
06505       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
06506   }
06507 
06508   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
06509 }
06510 
06511 /// \brief Try to fold a build_vector that performs an 'addsub' (otherwise a
06512 /// 'vadd + vsub + blendi' sequence) into a single X86ISD::ADDSUB node.
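///
/// A minimal sketch of the pattern being matched (%A and %B are hypothetical
/// input vectors), for a v4f32 build_vector:
///   (build_vector (fsub (extract_vector_elt %A, 0), (extract_vector_elt %B, 0)),
///                 (fadd (extract_vector_elt %A, 1), (extract_vector_elt %B, 1)),
///                 (fsub (extract_vector_elt %A, 2), (extract_vector_elt %B, 2)),
///                 (fadd (extract_vector_elt %A, 3), (extract_vector_elt %B, 3)))
/// becomes (X86ISD::ADDSUB %A, %B), which selects to addsubps on SSE3 targets.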
06513 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
06514                            const X86Subtarget *Subtarget) {
06515   SDLoc DL(BV);
06516   EVT VT = BV->getValueType(0);
06517   unsigned NumElts = VT.getVectorNumElements();
06518   SDValue InVec0 = DAG.getUNDEF(VT);
06519   SDValue InVec1 = DAG.getUNDEF(VT);
06520 
06521   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
06522           VT == MVT::v2f64) && "build_vector with an invalid type found!");
06523 
06524   // Odd-numbered elements in the input build_vector are obtained by
06525   // adding two floating-point elements.
06526   // Even-numbered elements in the input build_vector are obtained by
06527   // subtracting two floating-point elements.
06528   unsigned ExpectedOpcode = ISD::FSUB;
06529   unsigned NextExpectedOpcode = ISD::FADD;
06530   bool AddFound = false;
06531   bool SubFound = false;
06532 
06533   for (unsigned i = 0, e = NumElts; i != e; i++) {
06534     SDValue Op = BV->getOperand(i);
06535 
06536     // Skip 'undef' values.
06537     unsigned Opcode = Op.getOpcode();
06538     if (Opcode == ISD::UNDEF) {
06539       std::swap(ExpectedOpcode, NextExpectedOpcode);
06540       continue;
06541     }
06542 
06543     // Early exit if we found an unexpected opcode.
06544     if (Opcode != ExpectedOpcode)
06545       return SDValue();
06546 
06547     SDValue Op0 = Op.getOperand(0);
06548     SDValue Op1 = Op.getOperand(1);
06549 
06550     // Try to match the following pattern:
06551     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
06552     // Early exit if we cannot match that sequence.
06553     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06554         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06555         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
06556         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
06557         Op0.getOperand(1) != Op1.getOperand(1))
06558       return SDValue();
06559 
06560     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06561     if (I0 != i)
06562       return SDValue();
06563 
06564     // We found a valid add/sub node. Update the information accordingly.
06565     if (i & 1)
06566       AddFound = true;
06567     else
06568       SubFound = true;
06569 
06570     // Update InVec0 and InVec1.
06571     if (InVec0.getOpcode() == ISD::UNDEF)
06572       InVec0 = Op0.getOperand(0);
06573     if (InVec1.getOpcode() == ISD::UNDEF)
06574       InVec1 = Op1.getOperand(0);
06575 
06576     // Make sure that the operands of each add/sub node always come
06577     // from the same pair of vectors.
06578     if (InVec0 != Op0.getOperand(0)) {
06579       if (ExpectedOpcode == ISD::FSUB)
06580         return SDValue();
06581 
06582       // FADD is commutable. Try to commute the operands
06583       // and then test again.
06584       std::swap(Op0, Op1);
06585       if (InVec0 != Op0.getOperand(0))
06586         return SDValue();
06587     }
06588 
06589     if (InVec1 != Op1.getOperand(0))
06590       return SDValue();
06591 
06592     // Update the pair of expected opcodes.
06593     std::swap(ExpectedOpcode, NextExpectedOpcode);
06594   }
06595 
06596   // Only fold into an ADDSUB if we found both an add and a sub and neither input is undef.
06597   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
06598       InVec1.getOpcode() != ISD::UNDEF)
06599     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
06600 
06601   return SDValue();
06602 }
06603 
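/// \brief Try to combine a build_vector into a single X86ISD::ADDSUB node or
/// into horizontal add/sub nodes (FHADD/FHSUB/HADD/HSUB), depending on the
/// element type and on which of SSE3/SSSE3/AVX/AVX2 the subtarget provides.
/// When a single wide horizontal node cannot be used, the build_vector may
/// instead be expanded into two 128-bit horizontal operations followed by a
/// concat_vectors (see ExpandHorizontalBinOp above).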
06604 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
06605                                           const X86Subtarget *Subtarget) {
06606   SDLoc DL(N);
06607   EVT VT = N->getValueType(0);
06608   unsigned NumElts = VT.getVectorNumElements();
06609   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
06610   SDValue InVec0, InVec1;
06611 
06612   // Try to match an ADDSUB.
06613   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
06614       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
06615     SDValue Value = matchAddSub(BV, DAG, Subtarget);
06616     if (Value.getNode())
06617       return Value;
06618   }
06619 
06620   // Try to match horizontal ADD/SUB.
06621   unsigned NumUndefsLO = 0;
06622   unsigned NumUndefsHI = 0;
06623   unsigned Half = NumElts/2;
06624 
06625   // Count the number of UNDEF operands in each half of the input build_vector.
06626   for (unsigned i = 0, e = Half; i != e; ++i)
06627     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06628       NumUndefsLO++;
06629 
06630   for (unsigned i = Half, e = NumElts; i != e; ++i)
06631     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06632       NumUndefsHI++;
06633 
06634   // Early exit if this is a build_vector of all UNDEFs, or if all but one of
06635   // the operands are UNDEF.
06636   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
06637     return SDValue();
06638 
06639   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
06640     // Try to match an SSE3 float HADD/HSUB.
06641     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06642       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06643     
06644     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06645       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06646   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
06647     // Try to match an SSSE3 integer HADD/HSUB.
06648     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06649       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
06650     
06651     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06652       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
06653   }
06654   
06655   if (!Subtarget->hasAVX())
06656     return SDValue();
06657 
06658   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
06659     // Try to match an AVX horizontal add/sub of packed single/double
06660     // precision floating point values from 256-bit vectors.
06661     SDValue InVec2, InVec3;
06662     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
06663         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
06664         ((InVec0.getOpcode() == ISD::UNDEF ||
06665           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06666         ((InVec1.getOpcode() == ISD::UNDEF ||
06667           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06668       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06669 
06670     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
06671         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
06672         ((InVec0.getOpcode() == ISD::UNDEF ||
06673           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06674         ((InVec1.getOpcode() == ISD::UNDEF ||
06675           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06676       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06677   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
06678     // Try to match an AVX2 horizontal add/sub of signed integers.
06679     SDValue InVec2, InVec3;
06680     unsigned X86Opcode;
06681     bool CanFold = true;
06682 
06683     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
06684         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
06685         ((InVec0.getOpcode() == ISD::UNDEF ||
06686           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06687         ((InVec1.getOpcode() == ISD::UNDEF ||
06688           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06689       X86Opcode = X86ISD::HADD;
06690     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
06691         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
06692         ((InVec0.getOpcode() == ISD::UNDEF ||
06693           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06694         ((InVec1.getOpcode() == ISD::UNDEF ||
06695           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06696       X86Opcode = X86ISD::HSUB;
06697     else
06698       CanFold = false;
06699 
06700     if (CanFold) {
06701       // Fold this build_vector into a single horizontal add/sub.
06702       // Do this only if the target has AVX2.
06703       if (Subtarget->hasAVX2())
06704         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
06705  
06706       // Do not try to expand this build_vector into a pair of horizontal
06707       // add/sub if we can emit a pair of scalar add/sub.
06708       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06709         return SDValue();
06710 
06711       // Convert this build_vector into a pair of horizontal binops followed
06712       // by a concat_vectors node.
06713       bool isUndefLO = NumUndefsLO == Half;
06714       bool isUndefHI = NumUndefsHI == Half;
06715       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
06716                                    isUndefLO, isUndefHI);
06717     }
06718   }
06719 
06720   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
06721        VT == MVT::v16i16) && Subtarget->hasAVX()) {
06722     unsigned X86Opcode;
06723     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06724       X86Opcode = X86ISD::HADD;
06725     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06726       X86Opcode = X86ISD::HSUB;
06727     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06728       X86Opcode = X86ISD::FHADD;
06729     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06730       X86Opcode = X86ISD::FHSUB;
06731     else
06732       return SDValue();
06733 
06734     // Don't try to expand this build_vector into a pair of horizontal add/sub
06735     // if we can simply emit a pair of scalar add/sub.
06736     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06737       return SDValue();
06738 
06739     // Convert this build_vector into two horizontal add/sub operations
06740     // followed by a concat_vectors node.
06741     bool isUndefLO = NumUndefsLO == Half;
06742     bool isUndefHI = NumUndefsHI == Half;
06743     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
06744                                  isUndefLO, isUndefHI);
06745   }
06746 
06747   return SDValue();
06748 }
06749 
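// LowerBUILD_VECTOR - Lower a generic BUILD_VECTOR node. The cheap special
// cases (all-zeros, all-ones, broadcasts, a single non-zero element) are
// tried first, before falling back to splitting 256/512-bit vectors into
// halves, the element-size-specific builders, and a final unpck-based
// expansion.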
06750 SDValue
06751 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
06752   SDLoc dl(Op);
06753 
06754   MVT VT = Op.getSimpleValueType();
06755   MVT ExtVT = VT.getVectorElementType();
06756   unsigned NumElems = Op.getNumOperands();
06757 
06758   // Build_vectors of i1 predicate vectors are handled by a dedicated routine.
06759   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
06760     return LowerBUILD_VECTORvXi1(Op, DAG);
06761 
06762   // Vectors containing all zeros can be matched by pxor and xorps later
06763   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06764     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
06765     // and 2) ensure that i64 scalars are eliminated on x86-32 targets.
06766     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
06767       return Op;
06768 
06769     return getZeroVector(VT, Subtarget, DAG, dl);
06770   }
06771 
06772   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
06773   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
06774   // vpcmpeqd on 256-bit vectors.
06775   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
06776     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
06777       return Op;
06778 
06779     if (!VT.is512BitVector())
06780       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
06781   }
06782 
06783   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
06784   if (Broadcast.getNode())
06785     return Broadcast;
06786 
06787   unsigned EVTBits = ExtVT.getSizeInBits();
06788 
06789   unsigned NumZero  = 0;
06790   unsigned NumNonZero = 0;
06791   unsigned NonZeros = 0;
06792   bool IsAllConstants = true;
06793   SmallSet<SDValue, 8> Values;
06794   for (unsigned i = 0; i < NumElems; ++i) {
06795     SDValue Elt = Op.getOperand(i);
06796     if (Elt.getOpcode() == ISD::UNDEF)
06797       continue;
06798     Values.insert(Elt);
06799     if (Elt.getOpcode() != ISD::Constant &&
06800         Elt.getOpcode() != ISD::ConstantFP)
06801       IsAllConstants = false;
06802     if (X86::isZeroNode(Elt))
06803       NumZero++;
06804     else {
06805       NonZeros |= (1 << i);
06806       NumNonZero++;
06807     }
06808   }
06809 
06810   // All-undef vector: return UNDEF. All-zero vectors were handled above.
06811   if (NumNonZero == 0)
06812     return DAG.getUNDEF(VT);
06813 
06814   // Special case for single non-zero, non-undef, element.
06815   if (NumNonZero == 1) {
06816     unsigned Idx = countTrailingZeros(NonZeros);
06817     SDValue Item = Op.getOperand(Idx);
06818 
06819     // If this is an insertion of an i64 value on x86-32, and if the top bits of
06820     // the value are obviously zero, truncate the value to i32 and do the
06821     // insertion that way.  Only do this if the value is non-constant or if the
06822     // value is a constant being inserted into element 0.  It is cheaper to do
06823     // a constant pool load than it is to do a movd + shuffle.
06824     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
06825         (!IsAllConstants || Idx == 0)) {
06826       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
06827         // Handle SSE only.
06828         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
06829         EVT VecVT = MVT::v4i32;
06830         unsigned VecElts = 4;
06831 
06832         // Truncate the value (which may itself be a constant) to i32, and
06833         // convert it to a vector with movd (S2V+shuffle to zero extend).
06834         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
06835         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
06836 
06837         // If using the new shuffle lowering, just directly insert this.
06838         if (ExperimentalVectorShuffleLowering)
06839           return DAG.getNode(
06840               ISD::BITCAST, dl, VT,
06841               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
06842 
06843         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06844 
06845         // Now we have our 32-bit value zero extended in the low element of
06846         // a vector.  If Idx != 0, swizzle it into place.
06847         if (Idx != 0) {
06848           SmallVector<int, 4> Mask;
06849           Mask.push_back(Idx);
06850           for (unsigned i = 1; i != VecElts; ++i)
06851             Mask.push_back(i);
06852           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
06853                                       &Mask[0]);
06854         }
06855         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06856       }
06857     }
06858 
06859     // If we have a constant or non-constant insertion into the low element of
06860     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
06861     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
06862     // depending on what the source datatype is.
06863     if (Idx == 0) {
06864       if (NumZero == 0)
06865         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06866 
06867       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
06868           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
06869         if (VT.is256BitVector() || VT.is512BitVector()) {
06870           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
06871           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
06872                              Item, DAG.getIntPtrConstant(0));
06873         }
06874         assert(VT.is128BitVector() && "Expected an SSE value type!");
06875         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06876         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
06877         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06878       }
06879 
06880       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
06881         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
06882         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
06883         if (VT.is256BitVector()) {
06884           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
06885           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
06886         } else {
06887           assert(VT.is128BitVector() && "Expected an SSE value type!");
06888           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06889         }
06890         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06891       }
06892     }
06893 
06894     // Is it a vector logical left shift?
06895     if (NumElems == 2 && Idx == 1 &&
06896         X86::isZeroNode(Op.getOperand(0)) &&
06897         !X86::isZeroNode(Op.getOperand(1))) {
06898       unsigned NumBits = VT.getSizeInBits();
06899       return getVShift(true, VT,
06900                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
06901                                    VT, Op.getOperand(1)),
06902                        NumBits/2, DAG, *this, dl);
06903     }
06904 
06905     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
06906       return SDValue();
06907 
06908     // Otherwise, if this is a vector with i32 or f32 elements, and the element
06909     // is a non-constant being inserted into an element other than the low one,
06910     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
06911     // movd/movss) to move this into the low element, then shuffle it into
06912     // place.
06913     if (EVTBits == 32) {
06914       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06915 
06916       // If using the new shuffle lowering, just directly insert this.
06917       if (ExperimentalVectorShuffleLowering)
06918         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
06919 
06920       // Turn it into a shuffle of zero and zero-extended scalar to vector.
06921       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
06922       SmallVector<int, 8> MaskVec;
06923       for (unsigned i = 0; i != NumElems; ++i)
06924         MaskVec.push_back(i == Idx ? 0 : 1);
06925       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
06926     }
06927   }
06928 
06929   // Splat is obviously ok. Let legalizer expand it to a shuffle.
06930   if (Values.size() == 1) {
06931     if (EVTBits == 32) {
06932       // Instead of a shuffle like this:
06933       //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
06934       // check whether it's possible to issue this instead:
06935       //   shuffle (vload ptr), undef, <1, 1, 1, 1>
06936       unsigned Idx = countTrailingZeros(NonZeros);
06937       SDValue Item = Op.getOperand(Idx);
06938       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
06939         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
06940     }
06941     return SDValue();
06942   }
06943 
06944   // A vector full of immediates; various special cases are already
06945   // handled, so this is best done with a single constant-pool load.
06946   if (IsAllConstants)
06947     return SDValue();
06948 
06949   // For AVX-length vectors, build the individual 128-bit pieces and use
06950   // shuffles to put them in place.
06951   if (VT.is256BitVector() || VT.is512BitVector()) {
06952     SmallVector<SDValue, 64> V;
06953     for (unsigned i = 0; i != NumElems; ++i)
06954       V.push_back(Op.getOperand(i));
06955 
06956     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
06957 
06958     // Build both the lower and upper subvector.
06959     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
06960                                 makeArrayRef(&V[0], NumElems/2));
06961     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
06962                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
06963 
06964     // Recreate the wider vector with the lower and upper part.
06965     if (VT.is256BitVector())
06966       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06967     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
06968   }
06969 
06970   // Let legalizer expand 2-wide build_vectors.
06971   if (EVTBits == 64) {
06972     if (NumNonZero == 1) {
06973       // One half is zero or undef.
06974       unsigned Idx = countTrailingZeros(NonZeros);
06975       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
06976                                  Op.getOperand(Idx));
06977       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
06978     }
06979     return SDValue();
06980   }
06981 
06982   // If element VT is < 32 bits, convert it to inserts into a zero vector.
06983   if (EVTBits == 8 && NumElems == 16) {
06984     SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
06985                                         Subtarget, *this);
06986     if (V.getNode()) return V;
06987   }
06988 
06989   if (EVTBits == 16 && NumElems == 8) {
06990     SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
06991                                       Subtarget, *this);
06992     if (V.getNode()) return V;
06993   }
06994 
06995   // If the element VT is 32 bits and there are 4 elements, try an INSERTPS.
06996   if (EVTBits == 32 && NumElems == 4) {
06997     SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
06998                                       NumZero, DAG, Subtarget, *this);
06999     if (V.getNode())
07000       return V;
07001   }
07002 
07003   // If the element VT is 32 bits, turn it into a number of shuffles.
07004   SmallVector<SDValue, 8> V(NumElems);
07005   if (NumElems == 4 && NumZero > 0) {
07006     for (unsigned i = 0; i < 4; ++i) {
07007       bool isZero = !(NonZeros & (1 << i));
07008       if (isZero)
07009         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
07010       else
07011         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
07012     }
07013 
07014     for (unsigned i = 0; i < 2; ++i) {
07015       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
07016         default: break;
07017         case 0:
07018           V[i] = V[i*2];  // Must be a zero vector.
07019           break;
07020         case 1:
07021           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
07022           break;
07023         case 2:
07024           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
07025           break;
07026         case 3:
07027           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
07028           break;
07029       }
07030     }
07031 
07032     bool Reverse1 = (NonZeros & 0x3) == 2;
07033     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
07034     int MaskVec[] = {
07035       Reverse1 ? 1 : 0,
07036       Reverse1 ? 0 : 1,
07037       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
07038       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
07039     };
07040     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
07041   }
07042 
07043   if (Values.size() > 1 && VT.is128BitVector()) {
07044     // Check for a build vector of consecutive loads.
07045     for (unsigned i = 0; i < NumElems; ++i)
07046       V[i] = Op.getOperand(i);
07047 
07048     // Check for elements which are consecutive loads.
07049     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
07050     if (LD.getNode())
07051       return LD;
07052 
07053     // Check for a build vector from mostly shuffle plus few inserting.
07054     SDValue Sh = buildFromShuffleMostly(Op, DAG);
07055     if (Sh.getNode())
07056       return Sh;
07057 
07058     // For SSE 4.1, build the vector element by element with insertps.
07059     if (getSubtarget()->hasSSE41()) {
07060       SDValue Result;
07061       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
07062         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
07063       else
07064         Result = DAG.getUNDEF(VT);
07065 
07066       for (unsigned i = 1; i < NumElems; ++i) {
07067         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
07068         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
07069                              Op.getOperand(i), DAG.getIntPtrConstant(i));
07070       }
07071       return Result;
07072     }
07073 
07074     // Otherwise, expand into a number of unpckl*. Start by extending each of
07075     // our (non-undef) elements to the full vector width with the element in the
07076     // bottom slot of the vector (which generates no code for SSE).
07077     for (unsigned i = 0; i < NumElems; ++i) {
07078       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
07079         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
07080       else
07081         V[i] = DAG.getUNDEF(VT);
07082     }
07083 
07084     // Next, we iteratively mix elements, e.g. for v4f32:
07085     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
07086     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
07087     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
07088     unsigned EltStride = NumElems >> 1;
07089     while (EltStride != 0) {
07090       for (unsigned i = 0; i < EltStride; ++i) {
07091         // If V[i+EltStride] is undef and this is the first round of mixing,
07092         // then it is safe to just drop this shuffle: V[i] is already in the
07093         // right place, the one element (since it's the first round) being
07094         // inserted as undef can be dropped.  This isn't safe for successive
07095         // rounds because they will permute elements within both vectors.
07096         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
07097             EltStride == NumElems/2)
07098           continue;
07099 
07100         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
07101       }
07102       EltStride >>= 1;
07103     }
07104     return V[0];
07105   }
07106   return SDValue();
07107 }
07108 
07109 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
07110 // to create 256-bit vectors from two other 128-bit ones.
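// For example (illustrative only): a (concat_vectors v4f32:%lo, v4f32:%hi)
// producing a v8f32 value is lowered by inserting %lo into the low 128 bits
// and %hi into the high 128 bits of the result, i.e. a vinsertf128 pattern.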
07111 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
07112   SDLoc dl(Op);
07113   MVT ResVT = Op.getSimpleValueType();
07114 
07115   assert((ResVT.is256BitVector() ||
07116           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
07117 
07118   SDValue V1 = Op.getOperand(0);
07119   SDValue V2 = Op.getOperand(1);
07120   unsigned NumElems = ResVT.getVectorNumElements();
07121   if (ResVT.is256BitVector())
07122     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
07123 
07124   if (Op.getNumOperands() == 4) {
07125     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
07126                                 ResVT.getVectorNumElements()/2);
07127     SDValue V3 = Op.getOperand(2);
07128     SDValue V4 = Op.getOperand(3);
07129     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
07130       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
07131   }
07132   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
07133 }
07134 
07135 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
07136   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
07137   assert((VT.is256BitVector() && Op.