LLVM API Documentation

X86ISelLowering.cpp
Go to the documentation of this file.
00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #define DEBUG_TYPE "x86-isel"
00016 #include "X86ISelLowering.h"
00017 #include "Utils/X86ShuffleDecode.h"
00018 #include "X86.h"
00019 #include "X86InstrBuilder.h"
00020 #include "X86TargetMachine.h"
00021 #include "X86TargetObjectFile.h"
00022 #include "llvm/ADT/SmallSet.h"
00023 #include "llvm/ADT/Statistic.h"
00024 #include "llvm/ADT/StringExtras.h"
00025 #include "llvm/ADT/VariadicFunction.h"
00026 #include "llvm/CodeGen/IntrinsicLowering.h"
00027 #include "llvm/CodeGen/MachineFrameInfo.h"
00028 #include "llvm/CodeGen/MachineFunction.h"
00029 #include "llvm/CodeGen/MachineInstrBuilder.h"
00030 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00031 #include "llvm/CodeGen/MachineModuleInfo.h"
00032 #include "llvm/CodeGen/MachineRegisterInfo.h"
00033 #include "llvm/IR/CallingConv.h"
00034 #include "llvm/IR/Constants.h"
00035 #include "llvm/IR/DerivedTypes.h"
00036 #include "llvm/IR/Function.h"
00037 #include "llvm/IR/GlobalAlias.h"
00038 #include "llvm/IR/GlobalVariable.h"
00039 #include "llvm/IR/Instructions.h"
00040 #include "llvm/IR/Intrinsics.h"
00041 #include "llvm/IR/LLVMContext.h"
00042 #include "llvm/MC/MCAsmInfo.h"
00043 #include "llvm/MC/MCContext.h"
00044 #include "llvm/MC/MCExpr.h"
00045 #include "llvm/MC/MCSymbol.h"
00046 #include "llvm/Support/CallSite.h"
00047 #include "llvm/Support/Debug.h"
00048 #include "llvm/Support/ErrorHandling.h"
00049 #include "llvm/Support/MathExtras.h"
00050 #include "llvm/Target/TargetOptions.h"
00051 #include <bitset>
00052 #include <cctype>
00053 using namespace llvm;
00054 
00055 STATISTIC(NumTailCalls, "Number of tail calls");
00056 
00057 // Forward declarations.
00058 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00059                        SDValue V2);
00060 
00061 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
00062 /// sets things up to match to an AVX VEXTRACTF128 instruction or a
00063 /// simple subregister reference.  Idx is an index in the 128 bits we
00064 /// want.  It need not be aligned to a 128-bit bounday.  That makes
00065 /// lowering EXTRACT_VECTOR_ELT operations easier.
00066 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
00067                                    SelectionDAG &DAG, SDLoc dl) {
00068   EVT VT = Vec.getValueType();
00069   assert(VT.is256BitVector() && "Unexpected vector size!");
00070   EVT ElVT = VT.getVectorElementType();
00071   unsigned Factor = VT.getSizeInBits()/128;
00072   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
00073                                   VT.getVectorNumElements()/Factor);
00074 
00075   // Extract from UNDEF is UNDEF.
00076   if (Vec.getOpcode() == ISD::UNDEF)
00077     return DAG.getUNDEF(ResultVT);
00078 
00079   // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
00080   // we can match to VEXTRACTF128.
00081   unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
00082 
00083   // This is the index of the first element of the 128-bit chunk
00084   // we want.
00085   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
00086                                * ElemsPerChunk);
00087 
00088   // If the input is a buildvector just emit a smaller one.
00089   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
00090     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
00091                        Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
00092 
00093   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00094   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
00095                                VecIdx);
00096 
00097   return Result;
00098 }
00099 
00100 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
00101 /// sets things up to match to an AVX VINSERTF128 instruction or a
00102 /// simple superregister reference.  Idx is an index in the 128 bits
00103 /// we want.  It need not be aligned to a 128-bit bounday.  That makes
00104 /// lowering INSERT_VECTOR_ELT operations easier.
00105 static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
00106                                   unsigned IdxVal, SelectionDAG &DAG,
00107                                   SDLoc dl) {
00108   // Inserting UNDEF is Result
00109   if (Vec.getOpcode() == ISD::UNDEF)
00110     return Result;
00111 
00112   EVT VT = Vec.getValueType();
00113   assert(VT.is128BitVector() && "Unexpected vector size!");
00114 
00115   EVT ElVT = VT.getVectorElementType();
00116   EVT ResultVT = Result.getValueType();
00117 
00118   // Insert the relevant 128 bits.
00119   unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
00120 
00121   // This is the index of the first element of the 128-bit chunk
00122   // we want.
00123   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
00124                                * ElemsPerChunk);
00125 
00126   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00127   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
00128                      VecIdx);
00129 }
00130 
00131 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
00132 /// instructions. This is used because creating CONCAT_VECTOR nodes of
00133 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
00134 /// large BUILD_VECTORS.
00135 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
00136                                    unsigned NumElems, SelectionDAG &DAG,
00137                                    SDLoc dl) {
00138   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00139   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
00140 }
00141 
00142 static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
00143   const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
00144   bool is64Bit = Subtarget->is64Bit();
00145 
00146   if (Subtarget->isTargetEnvMacho()) {
00147     if (is64Bit)
00148       return new X86_64MachoTargetObjectFile();
00149     return new TargetLoweringObjectFileMachO();
00150   }
00151 
00152   if (Subtarget->isTargetLinux())
00153     return new X86LinuxTargetObjectFile();
00154   if (Subtarget->isTargetELF())
00155     return new TargetLoweringObjectFileELF();
00156   if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
00157     return new TargetLoweringObjectFileCOFF();
00158   llvm_unreachable("unknown subtarget type");
00159 }
00160 
00161 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
00162   : TargetLowering(TM, createTLOF(TM)) {
00163   Subtarget = &TM.getSubtarget<X86Subtarget>();
00164   X86ScalarSSEf64 = Subtarget->hasSSE2();
00165   X86ScalarSSEf32 = Subtarget->hasSSE1();
00166   RegInfo = TM.getRegisterInfo();
00167   TD = getDataLayout();
00168 
00169   resetOperationActions();
00170 }
00171 
00172 void X86TargetLowering::resetOperationActions() {
00173   const TargetMachine &TM = getTargetMachine();
00174   static bool FirstTimeThrough = true;
00175 
00176   // If none of the target options have changed, then we don't need to reset the
00177   // operation actions.
00178   if (!FirstTimeThrough && TO == TM.Options) return;
00179 
00180   if (!FirstTimeThrough) {
00181     // Reinitialize the actions.
00182     initActions();
00183     FirstTimeThrough = false;
00184   }
00185 
00186   TO = TM.Options;
00187 
00188   // Set up the TargetLowering object.
00189   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00190 
00191   // X86 is weird, it always uses i8 for shift amounts and setcc results.
00192   setBooleanContents(ZeroOrOneBooleanContent);
00193   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00194   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
00195 
00196   // For 64-bit since we have so many registers use the ILP scheduler, for
00197   // 32-bit code use the register pressure specific scheduling.
00198   // For Atom, always use ILP scheduling.
00199   if (Subtarget->isAtom())
00200     setSchedulingPreference(Sched::ILP);
00201   else if (Subtarget->is64Bit())
00202     setSchedulingPreference(Sched::ILP);
00203   else
00204     setSchedulingPreference(Sched::RegPressure);
00205   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00206 
00207   // Bypass expensive divides on Atom when compiling with O2
00208   if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
00209     addBypassSlowDiv(32, 8);
00210     if (Subtarget->is64Bit())
00211       addBypassSlowDiv(64, 16);
00212   }
00213 
00214   if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
00215     // Setup Windows compiler runtime calls.
00216     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00217     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00218     setLibcallName(RTLIB::SREM_I64, "_allrem");
00219     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00220     setLibcallName(RTLIB::MUL_I64, "_allmul");
00221     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00222     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00223     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00224     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00225     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00226 
00227     // The _ftol2 runtime function has an unusual calling conv, which
00228     // is modeled by a special pseudo-instruction.
00229     setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
00230     setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
00231     setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
00232     setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
00233   }
00234 
00235   if (Subtarget->isTargetDarwin()) {
00236     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00237     setUseUnderscoreSetJmp(false);
00238     setUseUnderscoreLongJmp(false);
00239   } else if (Subtarget->isTargetMingw()) {
00240     // MS runtime is weird: it exports _setjmp, but longjmp!
00241     setUseUnderscoreSetJmp(true);
00242     setUseUnderscoreLongJmp(false);
00243   } else {
00244     setUseUnderscoreSetJmp(true);
00245     setUseUnderscoreLongJmp(true);
00246   }
00247 
00248   // Set up the register classes.
00249   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00250   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00251   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00252   if (Subtarget->is64Bit())
00253     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00254 
00255   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
00256 
00257   // We don't accept any truncstore of integer registers.
00258   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00259   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00260   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00261   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00262   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00263   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00264 
00265   // SETOEQ and SETUNE require checking two conditions.
00266   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00267   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00268   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00269   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00270   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00271   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00272 
00273   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00274   // operation.
00275   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00276   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00277   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00278 
00279   if (Subtarget->is64Bit()) {
00280     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00281     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00282   } else if (!TM.Options.UseSoftFloat) {
00283     // We have an algorithm for SSE2->double, and we turn this into a
00284     // 64-bit FILD followed by conditional FADD for other targets.
00285     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00286     // We have an algorithm for SSE2, and we turn this into a 64-bit
00287     // FILD for other targets.
00288     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00289   }
00290 
00291   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00292   // this operation.
00293   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00294   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00295 
00296   if (!TM.Options.UseSoftFloat) {
00297     // SSE has no i16 to fp conversion, only i32
00298     if (X86ScalarSSEf32) {
00299       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00300       // f32 and f64 cases are Legal, f80 case is not
00301       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00302     } else {
00303       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00304       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00305     }
00306   } else {
00307     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00308     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00309   }
00310 
00311   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00312   // are Legal, f80 is custom lowered.
00313   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00314   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00315 
00316   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
00317   // this operation.
00318   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00319   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00320 
00321   if (X86ScalarSSEf32) {
00322     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00323     // f32 and f64 cases are Legal, f80 case is not
00324     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00325   } else {
00326     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00327     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00328   }
00329 
00330   // Handle FP_TO_UINT by promoting the destination to a larger signed
00331   // conversion.
00332   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00333   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00334   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
00335 
00336   if (Subtarget->is64Bit()) {
00337     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00338     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00339   } else if (!TM.Options.UseSoftFloat) {
00340     // Since AVX is a superset of SSE3, only check for SSE here.
00341     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00342       // Expand FP_TO_UINT into a select.
00343       // FIXME: We would like to use a Custom expander here eventually to do
00344       // the optimal thing for SSE vs. the default expansion in the legalizer.
00345       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00346     else
00347       // With SSE3 we can use fisttpll to convert to a signed i64; without
00348       // SSE, we're stuck with a fistpll.
00349       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00350   }
00351 
00352   if (isTargetFTOL()) {
00353     // Use the _ftol2 runtime function, which has a pseudo-instruction
00354     // to handle its weird calling convention.
00355     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00356   }
00357 
00358   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00359   if (!X86ScalarSSEf64) {
00360     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00361     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00362     if (Subtarget->is64Bit()) {
00363       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00364       // Without SSE, i64->f64 goes through memory.
00365       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00366     }
00367   }
00368 
00369   // Scalar integer divide and remainder are lowered to use operations that
00370   // produce two results, to match the available instructions. This exposes
00371   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00372   // into a single instruction.
00373   //
00374   // Scalar integer multiply-high is also lowered to use two-result
00375   // operations, to match the available instructions. However, plain multiply
00376   // (low) operations are left as Legal, as there are single-result
00377   // instructions for this in x86. Using the two-result multiply instructions
00378   // when both high and low results are needed must be arranged by dagcombine.
00379   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00380     MVT VT = IntVTs[i];
00381     setOperationAction(ISD::MULHS, VT, Expand);
00382     setOperationAction(ISD::MULHU, VT, Expand);
00383     setOperationAction(ISD::SDIV, VT, Expand);
00384     setOperationAction(ISD::UDIV, VT, Expand);
00385     setOperationAction(ISD::SREM, VT, Expand);
00386     setOperationAction(ISD::UREM, VT, Expand);
00387 
00388     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
00389     setOperationAction(ISD::ADDC, VT, Custom);
00390     setOperationAction(ISD::ADDE, VT, Custom);
00391     setOperationAction(ISD::SUBC, VT, Custom);
00392     setOperationAction(ISD::SUBE, VT, Custom);
00393   }
00394 
00395   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00396   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00397   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00398   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00399   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00400   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00401   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00402   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00403   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00404   setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
00405   if (Subtarget->is64Bit())
00406     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00407   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00408   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00409   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00410   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00411   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00412   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00413   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00414   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00415 
00416   // Promote the i8 variants and force them on up to i32 which has a shorter
00417   // encoding.
00418   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00419   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00420   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00421   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
00422   if (Subtarget->hasBMI()) {
00423     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00424     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00425     if (Subtarget->is64Bit())
00426       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00427   } else {
00428     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00429     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00430     if (Subtarget->is64Bit())
00431       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00432   }
00433 
00434   if (Subtarget->hasLZCNT()) {
00435     // When promoting the i8 variants, force them to i32 for a shorter
00436     // encoding.
00437     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00438     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00439     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00440     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00441     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00442     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00443     if (Subtarget->is64Bit())
00444       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00445   } else {
00446     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00447     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00448     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00449     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00450     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00451     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00452     if (Subtarget->is64Bit()) {
00453       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00454       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00455     }
00456   }
00457 
00458   if (Subtarget->hasPOPCNT()) {
00459     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00460   } else {
00461     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00462     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00463     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00464     if (Subtarget->is64Bit())
00465       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00466   }
00467 
00468   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00469   setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
00470 
00471   // These should be promoted to a larger select which is supported.
00472   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00473   // X86 wants to expand cmov itself.
00474   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00475   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00476   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00477   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00478   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00479   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00480   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00481   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00482   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00483   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00484   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00485   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00486   if (Subtarget->is64Bit()) {
00487     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00488     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00489   }
00490   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00491   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
00492   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
00493   // support continuation, user-level threading, etc. As a result, no
00494   // other SjLj exception interfaces are implemented and please don't build
00495   // your own exception handling based on them.
00496   // LLVM/Clang supports zero-cost DWARF exception handling.
00497   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00498   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00499 
00500   // Darwin ABI issue.
00501   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00502   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00503   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00504   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00505   if (Subtarget->is64Bit())
00506     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00507   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00508   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00509   if (Subtarget->is64Bit()) {
00510     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00511     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00512     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00513     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00514     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00515   }
00516   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00517   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00518   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00519   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00520   if (Subtarget->is64Bit()) {
00521     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00522     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00523     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00524   }
00525 
00526   if (Subtarget->hasSSE1())
00527     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00528 
00529   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00530 
00531   // Expand certain atomics
00532   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00533     MVT VT = IntVTs[i];
00534     setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
00535     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00536     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00537   }
00538 
00539   if (!Subtarget->is64Bit()) {
00540     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
00541     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
00542     setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
00543     setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
00544     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
00545     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
00546     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
00547     setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
00548     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
00549     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
00550     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
00551     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
00552   }
00553 
00554   if (Subtarget->hasCmpxchg16b()) {
00555     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
00556   }
00557 
00558   // FIXME - use subtarget debug flags
00559   if (!Subtarget->isTargetDarwin() &&
00560       !Subtarget->isTargetELF() &&
00561       !Subtarget->isTargetCygMing()) {
00562     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00563   }
00564 
00565   setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
00566   setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
00567   setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
00568   setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
00569   if (Subtarget->is64Bit()) {
00570     setExceptionPointerRegister(X86::RAX);
00571     setExceptionSelectorRegister(X86::RDX);
00572   } else {
00573     setExceptionPointerRegister(X86::EAX);
00574     setExceptionSelectorRegister(X86::EDX);
00575   }
00576   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00577   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00578 
00579   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00580   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00581 
00582   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00583   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00584 
00585   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00586   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00587   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00588   if (Subtarget->is64Bit()) {
00589     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00590     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00591   } else {
00592     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00593     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00594   }
00595 
00596   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00597   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00598 
00599   if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
00600     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
00601                        MVT::i64 : MVT::i32, Custom);
00602   else if (TM.Options.EnableSegmentedStacks)
00603     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
00604                        MVT::i64 : MVT::i32, Custom);
00605   else
00606     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
00607                        MVT::i64 : MVT::i32, Expand);
00608 
00609   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00610     // f32 and f64 use SSE.
00611     // Set up the FP register classes.
00612     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00613     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00614 
00615     // Use ANDPD to simulate FABS.
00616     setOperationAction(ISD::FABS , MVT::f64, Custom);
00617     setOperationAction(ISD::FABS , MVT::f32, Custom);
00618 
00619     // Use XORP to simulate FNEG.
00620     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00621     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00622 
00623     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00624     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00625     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00626 
00627     // Lower this to FGETSIGNx86 plus an AND.
00628     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00629     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00630 
00631     // We don't support sin/cos/fmod
00632     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00633     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00634     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00635     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00636     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00637     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00638 
00639     // Expand FP immediates into loads from the stack, except for the special
00640     // cases we handle.
00641     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00642     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00643   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00644     // Use SSE for f32, x87 for f64.
00645     // Set up the FP register classes.
00646     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00647     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00648 
00649     // Use ANDPS to simulate FABS.
00650     setOperationAction(ISD::FABS , MVT::f32, Custom);
00651 
00652     // Use XORP to simulate FNEG.
00653     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00654 
00655     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00656 
00657     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00658     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00659     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00660 
00661     // We don't support sin/cos/fmod
00662     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00663     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00664     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00665 
00666     // Special cases we handle for FP constants.
00667     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00668     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00669     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00670     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00671     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00672 
00673     if (!TM.Options.UnsafeFPMath) {
00674       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00675       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00676       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00677     }
00678   } else if (!TM.Options.UseSoftFloat) {
00679     // f32 and f64 in x87.
00680     // Set up the FP register classes.
00681     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00682     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00683 
00684     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00685     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00686     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00687     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00688 
00689     if (!TM.Options.UnsafeFPMath) {
00690       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00691       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00692       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00693       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00694       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00695       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00696     }
00697     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00698     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00699     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00700     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00701     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00702     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00703     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00704     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00705   }
00706 
00707   // We don't support FMA.
00708   setOperationAction(ISD::FMA, MVT::f64, Expand);
00709   setOperationAction(ISD::FMA, MVT::f32, Expand);
00710 
00711   // Long double always uses X87.
00712   if (!TM.Options.UseSoftFloat) {
00713     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00714     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00715     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00716     {
00717       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00718       addLegalFPImmediate(TmpFlt);  // FLD0
00719       TmpFlt.changeSign();
00720       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00721 
00722       bool ignored;
00723       APFloat TmpFlt2(+1.0);
00724       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00725                       &ignored);
00726       addLegalFPImmediate(TmpFlt2);  // FLD1
00727       TmpFlt2.changeSign();
00728       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00729     }
00730 
00731     if (!TM.Options.UnsafeFPMath) {
00732       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00733       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00734       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00735     }
00736 
00737     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00738     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00739     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00740     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00741     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00742     setOperationAction(ISD::FMA, MVT::f80, Expand);
00743   }
00744 
00745   // Always use a library call for pow.
00746   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00747   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00748   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00749 
00750   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00751   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00752   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00753   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00754   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00755 
00756   // First set operation action for all vector types to either promote
00757   // (for widening) or expand (for scalarization). Then we will selectively
00758   // turn on ones that can be effectively codegen'd.
00759   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
00760            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
00761     MVT VT = (MVT::SimpleValueType)i;
00762     setOperationAction(ISD::ADD , VT, Expand);
00763     setOperationAction(ISD::SUB , VT, Expand);
00764     setOperationAction(ISD::FADD, VT, Expand);
00765     setOperationAction(ISD::FNEG, VT, Expand);
00766     setOperationAction(ISD::FSUB, VT, Expand);
00767     setOperationAction(ISD::MUL , VT, Expand);
00768     setOperationAction(ISD::FMUL, VT, Expand);
00769     setOperationAction(ISD::SDIV, VT, Expand);
00770     setOperationAction(ISD::UDIV, VT, Expand);
00771     setOperationAction(ISD::FDIV, VT, Expand);
00772     setOperationAction(ISD::SREM, VT, Expand);
00773     setOperationAction(ISD::UREM, VT, Expand);
00774     setOperationAction(ISD::LOAD, VT, Expand);
00775     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00776     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00777     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00778     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00779     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00780     setOperationAction(ISD::FABS, VT, Expand);
00781     setOperationAction(ISD::FSIN, VT, Expand);
00782     setOperationAction(ISD::FSINCOS, VT, Expand);
00783     setOperationAction(ISD::FCOS, VT, Expand);
00784     setOperationAction(ISD::FSINCOS, VT, Expand);
00785     setOperationAction(ISD::FREM, VT, Expand);
00786     setOperationAction(ISD::FMA,  VT, Expand);
00787     setOperationAction(ISD::FPOWI, VT, Expand);
00788     setOperationAction(ISD::FSQRT, VT, Expand);
00789     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00790     setOperationAction(ISD::FFLOOR, VT, Expand);
00791     setOperationAction(ISD::FCEIL, VT, Expand);
00792     setOperationAction(ISD::FTRUNC, VT, Expand);
00793     setOperationAction(ISD::FRINT, VT, Expand);
00794     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00795     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00796     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00797     setOperationAction(ISD::SDIVREM, VT, Expand);
00798     setOperationAction(ISD::UDIVREM, VT, Expand);
00799     setOperationAction(ISD::FPOW, VT, Expand);
00800     setOperationAction(ISD::CTPOP, VT, Expand);
00801     setOperationAction(ISD::CTTZ, VT, Expand);
00802     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00803     setOperationAction(ISD::CTLZ, VT, Expand);
00804     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00805     setOperationAction(ISD::SHL, VT, Expand);
00806     setOperationAction(ISD::SRA, VT, Expand);
00807     setOperationAction(ISD::SRL, VT, Expand);
00808     setOperationAction(ISD::ROTL, VT, Expand);
00809     setOperationAction(ISD::ROTR, VT, Expand);
00810     setOperationAction(ISD::BSWAP, VT, Expand);
00811     setOperationAction(ISD::SETCC, VT, Expand);
00812     setOperationAction(ISD::FLOG, VT, Expand);
00813     setOperationAction(ISD::FLOG2, VT, Expand);
00814     setOperationAction(ISD::FLOG10, VT, Expand);
00815     setOperationAction(ISD::FEXP, VT, Expand);
00816     setOperationAction(ISD::FEXP2, VT, Expand);
00817     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00818     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00819     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00820     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00821     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00822     setOperationAction(ISD::TRUNCATE, VT, Expand);
00823     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00824     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00825     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00826     setOperationAction(ISD::VSELECT, VT, Expand);
00827     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
00828              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
00829       setTruncStoreAction(VT,
00830                           (MVT::SimpleValueType)InnerVT, Expand);
00831     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
00832     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
00833     setLoadExtAction(ISD::EXTLOAD, VT, Expand);
00834   }
00835 
00836   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00837   // with -msoft-float, disable use of MMX as well.
00838   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00839     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00840     // No operations on x86mmx supported, everything uses intrinsics.
00841   }
00842 
00843   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00844   // into smaller operations.
00845   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
00846   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
00847   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
00848   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
00849   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
00850   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
00851   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
00852   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
00853   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
00854   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
00855   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
00856   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
00857   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
00858   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
00859   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
00860   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
00861   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
00862   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
00863   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
00864   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
00865   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00866   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
00867   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
00868   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
00869   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
00870   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
00871   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
00872   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
00873   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
00874 
00875   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00876     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00877 
00878     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00879     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00880     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00881     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00882     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00883     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00884     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00885     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00886     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00887     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00888     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00889     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00890   }
00891 
00892   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00893     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00894 
00895     // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
00896     // registers cannot be used even for integer operations.
00897     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00898     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00899     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00900     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00901 
00902     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00903     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00904     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00905     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00906     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00907     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00908     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00909     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00910     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00911     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00912     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00913     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00914     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00915     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00916     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00917     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00918     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00919     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00920 
00921     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00922     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00923     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00924     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00925 
00926     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00927     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00928     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00929     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00930     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00931 
00932     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
00933     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00934       MVT VT = (MVT::SimpleValueType)i;
00935       // Do not attempt to custom lower non-power-of-2 vectors
00936       if (!isPowerOf2_32(VT.getVectorNumElements()))
00937         continue;
00938       // Do not attempt to custom lower non-128-bit vectors
00939       if (!VT.is128BitVector())
00940         continue;
00941       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
00942       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
00943       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
00944     }
00945 
00946     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
00947     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
00948     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
00949     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
00950     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
00951     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
00952 
00953     if (Subtarget->is64Bit()) {
00954       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
00955       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
00956     }
00957 
00958     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
00959     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00960       MVT VT = (MVT::SimpleValueType)i;
00961 
00962       // Do not attempt to promote non-128-bit vectors
00963       if (!VT.is128BitVector())
00964         continue;
00965 
00966       setOperationAction(ISD::AND,    VT, Promote);
00967       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
00968       setOperationAction(ISD::OR,     VT, Promote);
00969       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
00970       setOperationAction(ISD::XOR,    VT, Promote);
00971       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
00972       setOperationAction(ISD::LOAD,   VT, Promote);
00973       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
00974       setOperationAction(ISD::SELECT, VT, Promote);
00975       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
00976     }
00977 
00978     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00979 
00980     // v2i64 and v2f64 loads are legal; custom lower their selects.
00981     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
00982     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
00983     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
00984     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
00985 
00986     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
00987     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
00988 
00989     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
00990     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
00991     // As there is no 64-bit GPR available, we need to build a special custom
00992     // sequence to convert from v2i32 to v2f32.
00993     if (!Subtarget->is64Bit())
00994       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
00995 
00996     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
00997     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
00998 
00999     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
01000   }
01001 
01002   if (Subtarget->hasSSE41()) {
01003     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
01004     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
01005     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
01006     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
01007     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
01008     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
01009     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
01010     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
01011     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
01012     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
01013 
01014     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
01015     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
01016     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
01017     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
01018     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
01019     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
01020     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
01021     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
01022     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
01023     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
01024 
01025     // FIXME: Do we need to handle scalar-to-vector here?
01026     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
01027 
01028     setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
01029     setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
01030     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
01031     setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
01032     setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
01033 
01034     // i8 and i16 vectors are custom, because the source register and
01035     // source memory operand types are not the same width.  f32 vectors are
01036     // custom since the immediate controlling the insert encodes additional
01037     // information.
01038     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
01039     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01040     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01041     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01042 
01043     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
01044     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
01045     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
01046     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
01047 
01048     // FIXME: these should be Legal but that's only for the case where
01049     // the index is constant.  For now custom expand to deal with that.
01050     if (Subtarget->is64Bit()) {
01051       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01052       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01053     }
01054   }
01055 
01056   if (Subtarget->hasSSE2()) {
01057     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
01058     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
01059 
01060     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01061     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01062 
01063     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01064     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01065 
01066     // In the customized shift lowering, the legal cases in AVX2 will be
01067     // recognized.
01068     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01069     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01070 
01071     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01072     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01073 
01074     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01075 
01076     setOperationAction(ISD::SDIV,              MVT::v8i16, Custom);
01077     setOperationAction(ISD::SDIV,              MVT::v4i32, Custom);
01078   }
01079 
01080   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01081     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01082     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01083     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01084     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01085     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01086     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01087 
01088     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01089     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01090     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01091 
01092     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01093     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01094     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01095     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01096     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01097     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01098     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01099     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01100     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01101     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01102     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01103     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01104 
01105     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01106     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01107     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01108     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01109     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01110     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01111     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01112     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01113     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01114     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01115     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01116     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01117 
01118     setOperationAction(ISD::TRUNCATE,           MVT::v8i16, Custom);
01119     setOperationAction(ISD::TRUNCATE,           MVT::v4i32, Custom);
01120 
01121     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
01122 
01123     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01124     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01125     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01126     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01127 
01128     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i32, Custom);
01129     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01130     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01131 
01132     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
01133 
01134     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01135     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01136 
01137     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01138     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01139 
01140     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01141     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01142 
01143     setOperationAction(ISD::SDIV,              MVT::v16i16, Custom);
01144 
01145     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01146     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01147     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01148     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01149 
01150     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01151     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01152     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01153 
01154     setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
01155     setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
01156     setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
01157     setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
01158 
01159     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01160     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01161     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01162     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01163     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01164     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01165 
01166     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01167       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01168       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01169       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01170       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01171       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01172       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01173     }
01174 
01175     if (Subtarget->hasInt256()) {
01176       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01177       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01178       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01179       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01180 
01181       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01182       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01183       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01184       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01185 
01186       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01187       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01188       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01189       // Don't lower v32i8 because there is no 128-bit byte mul
01190 
01191       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01192 
01193       setOperationAction(ISD::SDIV,            MVT::v8i32, Custom);
01194     } else {
01195       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01196       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01197       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01198       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01199 
01200       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01201       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01202       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01203       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01204 
01205       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01206       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01207       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01208       // Don't lower v32i8 because there is no 128-bit byte mul
01209     }
01210 
01211     // In the customized shift lowering, the legal cases in AVX2 will be
01212     // recognized.
01213     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01214     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01215 
01216     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01217     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01218 
01219     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01220 
01221     // Custom lower several nodes for 256-bit types.
01222     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01223              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01224       MVT VT = (MVT::SimpleValueType)i;
01225 
01226       // Extract subvector is special because the value type
01227       // (result) is 128-bit but the source is 256-bit wide.
01228       if (VT.is128BitVector())
01229         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01230 
01231       // Do not attempt to custom lower other non-256-bit vectors
01232       if (!VT.is256BitVector())
01233         continue;
01234 
01235       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01236       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01237       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01238       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01239       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01240       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01241       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01242     }
01243 
01244     // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
01245     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01246       MVT VT = (MVT::SimpleValueType)i;
01247 
01248       // Do not attempt to promote non-256-bit vectors
01249       if (!VT.is256BitVector())
01250         continue;
01251 
01252       setOperationAction(ISD::AND,    VT, Promote);
01253       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01254       setOperationAction(ISD::OR,     VT, Promote);
01255       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01256       setOperationAction(ISD::XOR,    VT, Promote);
01257       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01258       setOperationAction(ISD::LOAD,   VT, Promote);
01259       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01260       setOperationAction(ISD::SELECT, VT, Promote);
01261       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01262     }
01263   }
01264 
01265   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
01266   // of this type with custom code.
01267   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
01268            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
01269     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
01270                        Custom);
01271   }
01272 
01273   // We want to custom lower some of our intrinsics.
01274   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01275   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01276 
01277   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01278   // handle type legalization for these operations here.
01279   //
01280   // FIXME: We really should do custom legalization for addition and
01281   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01282   // than generic legalization for 64-bit multiplication-with-overflow, though.
01283   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01284     // Add/Sub/Mul with overflow operations are custom lowered.
01285     MVT VT = IntVTs[i];
01286     setOperationAction(ISD::SADDO, VT, Custom);
01287     setOperationAction(ISD::UADDO, VT, Custom);
01288     setOperationAction(ISD::SSUBO, VT, Custom);
01289     setOperationAction(ISD::USUBO, VT, Custom);
01290     setOperationAction(ISD::SMULO, VT, Custom);
01291     setOperationAction(ISD::UMULO, VT, Custom);
01292   }
01293 
01294   // There are no 8-bit 3-address imul/mul instructions
01295   setOperationAction(ISD::SMULO, MVT::i8, Expand);
01296   setOperationAction(ISD::UMULO, MVT::i8, Expand);
01297 
01298   if (!Subtarget->is64Bit()) {
01299     // These libcalls are not available in 32-bit.
01300     setLibcallName(RTLIB::SHL_I128, 0);
01301     setLibcallName(RTLIB::SRL_I128, 0);
01302     setLibcallName(RTLIB::SRA_I128, 0);
01303   }
01304 
01305   // Combine sin / cos into one node or libcall if possible.
01306   if (Subtarget->hasSinCos()) {
01307     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01308     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01309     if (Subtarget->isTargetDarwin()) {
01310       // For MacOSX, we don't want the normal expansion of a libcall to
01311       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
01312       // traffic.
01313       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01314       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01315     }
01316   }
01317 
01318   // We have target-specific dag combine patterns for the following nodes:
01319   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01320   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01321   setTargetDAGCombine(ISD::VSELECT);
01322   setTargetDAGCombine(ISD::SELECT);
01323   setTargetDAGCombine(ISD::SHL);
01324   setTargetDAGCombine(ISD::SRA);
01325   setTargetDAGCombine(ISD::SRL);
01326   setTargetDAGCombine(ISD::OR);
01327   setTargetDAGCombine(ISD::AND);
01328   setTargetDAGCombine(ISD::ADD);
01329   setTargetDAGCombine(ISD::FADD);
01330   setTargetDAGCombine(ISD::FSUB);
01331   setTargetDAGCombine(ISD::FMA);
01332   setTargetDAGCombine(ISD::SUB);
01333   setTargetDAGCombine(ISD::LOAD);
01334   setTargetDAGCombine(ISD::STORE);
01335   setTargetDAGCombine(ISD::ZERO_EXTEND);
01336   setTargetDAGCombine(ISD::ANY_EXTEND);
01337   setTargetDAGCombine(ISD::SIGN_EXTEND);
01338   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01339   setTargetDAGCombine(ISD::TRUNCATE);
01340   setTargetDAGCombine(ISD::SINT_TO_FP);
01341   setTargetDAGCombine(ISD::SETCC);
01342   if (Subtarget->is64Bit())
01343     setTargetDAGCombine(ISD::MUL);
01344   setTargetDAGCombine(ISD::XOR);
01345 
01346   computeRegisterProperties();
01347 
01348   // On Darwin, -Os means optimize for size without hurting performance,
01349   // do not reduce the limit.
01350   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01351   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01352   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01353   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01354   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01355   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01356   setPrefLoopAlignment(4); // 2^4 bytes.
01357 
01358   // Predictable cmovs don't hurt on Atom because it's in-order.
01359   PredictableSelectIsExpensive = !Subtarget->isAtom();
01360 
01361   setPrefFunctionAlignment(4); // 2^4 bytes.
01362 }
01363 
/// getSetCCResultType - Return the type used for the result of a SETCC on
/// operands of type VT: i8 for any non-vector VT, otherwise VT with its
/// element type changed to integer. The LLVMContext argument is unused.
01364 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01365   if (!VT.isVector()) return MVT::i8;
01366   return VT.changeVectorElementTypeToInteger();
01367 }
01368 
01369 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
01370 /// the desired ByVal argument alignment. Recursively walks Ty and sets
/// MaxAlign to 16 when a 128-bit vector is found directly, as an array
/// element type, or as a struct member. Types without such a vector leave
/// MaxAlign unchanged. MaxAlign is an in/out parameter seeded by the caller.
01371 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01372   if (MaxAlign == 16)  // Already at the value this helper can raise it to.
01373     return;
01374   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01375     if (VTy->getBitWidth() == 128)
01376       MaxAlign = 16;
01377   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01378     unsigned EltAlign = 0;  // Stays 0 unless the element type holds a 128-bit vector.
01379     getMaxByValAlign(ATy->getElementType(), EltAlign);
01380     if (EltAlign > MaxAlign)
01381       MaxAlign = EltAlign;
01382   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01383     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01384       unsigned EltAlign = 0;
01385       getMaxByValAlign(STy->getElementType(i), EltAlign);
01386       if (EltAlign > MaxAlign)
01387         MaxAlign = EltAlign;
01388       if (MaxAlign == 16)  // No later member can change the answer.
01389         break;
01390     }
01391   }
01392 }
01393 
01394 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
01395 /// function arguments in the caller parameter area. On 64-bit targets this
01396 /// is the larger of 8 and the ABI alignment of Ty. On 32-bit targets it is 4,
01397 /// or 16 when SSE1 is available and getMaxByValAlign finds a 128-bit vector.
01398 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01399   if (Subtarget->is64Bit()) {
01400     // Max of 8 and alignment of type.
01401     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01402     if (TyAlign > 8)
01403       return TyAlign;
01404     return 8;
01405   }
01406 
01407   unsigned Align = 4;  // 32-bit default; only raised when SSE1 is available.
01408   if (Subtarget->hasSSE1())
01409     getMaxByValAlign(Ty, Align);
01410   return Align;
01411 }
01412 
01413 /// getOptimalMemOpType - Returns the target specific optimal type for load
01414 /// and store operations as a result of memset, memcpy, and memmove
01415 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
01416 /// constraint. Similarly, if SrcAlign is zero there is no need to check it
01417 /// against the alignment requirement, probably because the source does not
01418 /// need to be loaded. If 'IsMemset' is true, a memset is being expanded; if
01419 /// 'ZeroMemset' is true, it is a memset of zero. 'MemcpyStrSrc' indicates
01420 /// whether the memcpy source is constant so it does not need to be loaded.
01421 /// The original interface comment says EVT::Other is returned when generic
01422 /// target-independent logic should pick the type; the code below always
01423 /// returns a concrete type (a vector type, f64, i64 or i32).
01424 EVT
01425 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01426                                        unsigned DstAlign, unsigned SrcAlign,
01427                                        bool IsMemset, bool ZeroMemset,
01428                                        bool MemcpyStrSrc,
01429                                        MachineFunction &MF) const {
01430   const Function *F = MF.getFunction();
01431   if ((!IsMemset || ZeroMemset) &&  // Non-zero memsets never take the vector/f64 paths.
01432       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
01433                                        Attribute::NoImplicitFloat)) {
01434     if (Size >= 16 &&
01435         (Subtarget->isUnalignedMemAccessFast() ||
01436          ((DstAlign == 0 || DstAlign >= 16) &&
01437           (SrcAlign == 0 || SrcAlign >= 16)))) {
01438       if (Size >= 32) {  // Try 256-bit types first; fall through to 128-bit otherwise.
01439         if (Subtarget->hasInt256())
01440           return MVT::v8i32;
01441         if (Subtarget->hasFp256())
01442           return MVT::v8f32;
01443       }
01444       if (Subtarget->hasSSE2())
01445         return MVT::v4i32;
01446       if (Subtarget->hasSSE1())
01447         return MVT::v4f32;
01448     } else if (!MemcpyStrSrc && Size >= 8 &&
01449                !Subtarget->is64Bit() &&
01450                Subtarget->hasSSE2()) {
01451       // Do not use f64 to lower memcpy if source is string constant. It's
01452       // better to use i32 to avoid the loads.
01453       return MVT::f64;
01454     }
01455   }
01456   if (Subtarget->is64Bit() && Size >= 8)  // Integer fallback for every remaining case.
01457     return MVT::i64;
01458   return MVT::i32;
01459 }
01460 
/// isSafeMemOpType - Report whether VT may be used as a memory-operation
/// type. f32 and f64 are allowed only when X86ScalarSSEf32 / X86ScalarSSEf64
/// is set, respectively; every other type is reported safe.
/// NOTE(review): the two flags presumably mean scalar FP lives in SSE
/// registers; they are defined outside this chunk, so confirm there.
01461 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01462   if (VT == MVT::f32)
01463     return X86ScalarSSEf32;
01464   else if (VT == MVT::f64)
01465     return X86ScalarSSEf64;
01466   return true;
01467 }
01468 
/// allowsUnalignedMemoryAccesses - Always returns true; VT is not inspected.
/// When Fast is non-null, *Fast is set to the subtarget's
/// isUnalignedMemAccessFast() answer.
01469 bool
01470 X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
01471   if (Fast)
01472     *Fast = Subtarget->isUnalignedMemAccessFast();
01473   return true;
01474 }
01475 
01476 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
01477 /// current function.  The returned value is a member of the
01478 /// MachineJumpTableInfo::JTEntryKind enum: EK_Custom32 for PIC code with the
/// GOT PIC style, otherwise whatever the generic TargetLowering picks.
01479 unsigned X86TargetLowering::getJumpTableEncoding() const {
01480   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01481   // symbol.
01482   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01483       Subtarget->isPICStyleGOT())
01484     return MachineJumpTableInfo::EK_Custom32;
01485 
01486   // Otherwise, use the normal jump table encoding heuristics.
01487   return TargetLowering::getJumpTableEncoding();
01488 }
01489 
/// LowerCustomJumpTableEntry - Build the expression for one custom-encoded
/// jump table entry: a reference to MBB's symbol with the VK_GOTOFF variant,
/// created in Ctx. Asserts that the relocation model is PIC with the GOT PIC
/// style. MJTI and uid are not used.
01490 const MCExpr *
01491 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01492                                              const MachineBasicBlock *MBB,
01493                                              unsigned uid,MCContext &Ctx) const{
01494   assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01495          Subtarget->isPICStyleGOT());
01496   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01497   // entries.
01498   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01499                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01500 }
01501 
01502 /// getPICJumpTableRelocBase - Returns relocation base for the given PIC
01503 /// jumptable: an X86ISD::GlobalBaseReg node of pointer type on non-64-bit
/// targets, and Table itself on 64-bit targets.
01504 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01505                                                     SelectionDAG &DAG) const {
01506   if (!Subtarget->is64Bit())
01507     // This doesn't have SDLoc associated with it, but is not really the
01508     // same as a Register.
01509     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01510   return Table;
01511 }
01512 
01513 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
01514 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
01515 /// MCExpr. With the RIP-relative PIC style the generic TargetLowering
/// expression is used; otherwise the result is a reference to the function's
/// PIC base symbol.
01516 const MCExpr *X86TargetLowering::
01517 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01518                              MCContext &Ctx) const {
01519   // X86-64 uses RIP relative addressing based on the jump table label.
01520   if (Subtarget->isPICStyleRIPRel())
01521     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01522 
01523   // Otherwise, the reference is relative to the PIC base.
01524   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01525 }
01526 
01527 // FIXME: Why is this routine here? Move to RegInfo!
/// findRepresentativeClass - Return the (register class, cost) pair chosen to
/// represent VT. Integer types map to GR64 on 64-bit targets and GR32
/// otherwise, x86mmx maps to VR64, and the listed scalar-FP and vector types
/// map to VR128. Any other type is deferred to the generic TargetLowering
/// implementation. The cost is always 1 for the types handled here.
01528 std::pair<const TargetRegisterClass*, uint8_t>
01529 X86TargetLowering::findRepresentativeClass(MVT VT) const{
01530   const TargetRegisterClass *RRC = 0;
01531   uint8_t Cost = 1;
01532   switch (VT.SimpleTy) {
01533   default:
01534     return TargetLowering::findRepresentativeClass(VT);
01535   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01536     RRC = Subtarget->is64Bit() ?
01537       (const TargetRegisterClass*)&X86::GR64RegClass :
01538       (const TargetRegisterClass*)&X86::GR32RegClass;
01539     break;
01540   case MVT::x86mmx:
01541     RRC = &X86::VR64RegClass;
01542     break;
01543   case MVT::f32: case MVT::f64:
01544   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01545   case MVT::v4f32: case MVT::v2f64:
01546   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01547   case MVT::v4f64:
01548     RRC = &X86::VR128RegClass;  // Note: the 256-bit vector types share VR128 here too.
01549     break;
01550   }
01551   return std::make_pair(RRC, Cost);
01552 }
01553 
/// getStackCookieLocation - Report where the stack cookie lives. Returns
/// false, leaving the outputs untouched, on non-Linux targets. Otherwise
/// sets Offset and AddressSpace and returns true. Per the comments below,
/// address space 256 corresponds to %gs and 257 to %fs.
01554 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01555                                                unsigned &Offset) const {
01556   if (!Subtarget->isTargetLinux())
01557     return false;
01558 
01559   if (Subtarget->is64Bit()) {
01560     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
01561     Offset = 0x28;
01562     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01563       AddressSpace = 256;
01564     else
01565       AddressSpace = 257;
01566   } else {
01567     // %gs:0x14 on i386
01568     Offset = 0x14;
01569     AddressSpace = 256;
01570   }
01571   return true;
01572 }
01573 
01574 //===----------------------------------------------------------------------===//
01575 //               Return Value Calling Convention Implementation
01576 //===----------------------------------------------------------------------===//
01577 
01578 #include "X86GenCallingConv.inc"
01579 
/// CanLowerReturn - Return the result of checking the outgoing return values
/// in Outs against the RetCC_X86 return convention, using a scratch CCState
/// built for this call convention and function.
01580 bool
01581 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01582                                   MachineFunction &MF, bool isVarArg,
01583                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01584                         LLVMContext &Context) const {
01585   SmallVector<CCValAssign, 16> RVLocs;
01586   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
01587                  RVLocs, Context);
01588   return CCInfo.CheckReturn(Outs, RetCC_X86);
01589 }
01590 
/// LowerReturn - Lower an outgoing return into an X86ISD::RET_FLAG node.
/// Return locations are assigned with RetCC_X86. Values assigned to ST0/ST1
/// are appended directly as RET_FLAG operands; every other value is copied
/// into its assigned register with a glued CopyToReg and that register is
/// appended as an operand. Operand #0 is the chain and operand #1 is the
/// number of bytes to pop on return. For sret functions on 64-bit or Windows
/// targets the saved sret pointer is additionally copied into RAX/EAX.
/// Calls report_fatal_error for FP/SSE-register returns on x86-64 when the
/// required SSE level is disabled.
01591 SDValue
01592 X86TargetLowering::LowerReturn(SDValue Chain,
01593                                CallingConv::ID CallConv, bool isVarArg,
01594                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01595                                const SmallVectorImpl<SDValue> &OutVals,
01596                                SDLoc dl, SelectionDAG &DAG) const {
01597   MachineFunction &MF = DAG.getMachineFunction();
01598   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01599 
01600   SmallVector<CCValAssign, 16> RVLocs;
01601   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
01602                  RVLocs, *DAG.getContext());
01603   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01604 
01605   SDValue Flag;  // Glue linking successive CopyToReg nodes; empty until the first copy.
01606   SmallVector<SDValue, 6> RetOps;
01607   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
01608   // Operand #1 = Bytes To Pop
01609   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
01610                    MVT::i16));
01611 
01612   // Copy the result values into the output registers.
01613   for (unsigned i = 0; i != RVLocs.size(); ++i) {
01614     CCValAssign &VA = RVLocs[i];
01615     assert(VA.isRegLoc() && "Can only return in registers!");
01616     SDValue ValToCopy = OutVals[i];
01617     EVT ValVT = ValToCopy.getValueType();  // Type before any promotion below.
01618 
01619     // Promote values to the appropriate types
01620     if (VA.getLocInfo() == CCValAssign::SExt)
01621       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
01622     else if (VA.getLocInfo() == CCValAssign::ZExt)
01623       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
01624     else if (VA.getLocInfo() == CCValAssign::AExt)
01625       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
01626     else if (VA.getLocInfo() == CCValAssign::BCvt)
01627       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
01628 
01629     // If this is x86-64, and we disabled SSE, we can't return FP values,
01630     // or SSE or MMX vectors.
01631     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
01632          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
01633           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
01634       report_fatal_error("SSE register return with SSE disabled");
01635     }
01636     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
01637     // llvm-gcc has never done it right and no one has noticed, so this
01638     // should be OK for now.
01639     if (ValVT == MVT::f64 &&
01640         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
01641       report_fatal_error("SSE2 register return with SSE2 disabled");
01642 
01643     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
01644     // the RET instruction and handled by the FP Stackifier.
01645     if (VA.getLocReg() == X86::ST0 ||
01646         VA.getLocReg() == X86::ST1) {
01647       // If this is a copy from an xmm register to ST(0), use an FPExtend to
01648       // change the value to the FP stack register class.
01649       if (isScalarFPTypeInSSEReg(VA.getValVT()))
01650         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
01651       RetOps.push_back(ValToCopy);
01652       // Don't emit a copytoreg.
01653       continue;
01654     }
01655 
01656     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
01657     // which is returned in RAX / RDX.
01658     if (Subtarget->is64Bit()) {
01659       if (ValVT == MVT::x86mmx) {
01660         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
01661           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
01662           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
01663                                   ValToCopy);
01664           // If we don't have SSE2 available, convert to v4f32 so the generated
01665           // register is legal.
01666           if (!Subtarget->hasSSE2())
01667             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
01668         }
01669       }
01670     }
01671 
01672     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
01673     Flag = Chain.getValue(1);  // Result #1 of the copy is taken as the glue for the next node.
01674     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
01675   }
01676 
01677   // The x86-64 ABIs require that for returning structs by value we copy
01678   // the sret argument into %rax/%eax (depending on ABI) for the return.
01679   // Win32 requires us to put the sret argument to %eax as well.
01680   // We saved the argument into a virtual register in the entry block,
01681   // so now we copy the value out and into %rax/%eax.
01682   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
01683       (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
01684     MachineFunction &MF = DAG.getMachineFunction();  // NOTE: shadows the identical MF declared at function entry.
01685     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();  // NOTE: likewise shadows FuncInfo.
01686     unsigned Reg = FuncInfo->getSRetReturnReg();
01687     assert(Reg &&
01688            "SRetReturnReg should have been set in LowerFormalArguments().");
01689     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
01690 
01691     unsigned RetValReg
01692         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
01693           X86::RAX : X86::EAX;
01694     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
01695     Flag = Chain.getValue(1);
01696 
01697     // RAX/EAX now acts like a return value.
01698     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
01699   }
01700 
01701   RetOps[0] = Chain;  // Update chain.
01702 
01703   // Add the flag if we have it.
01704   if (Flag.getNode())
01705     RetOps.push_back(Flag);
01706 
01707   return DAG.getNode(X86ISD::RET_FLAG, dl,
01708                      MVT::Other, &RetOps[0], RetOps.size());
01709 }
01710 
/// isUsedByReturnOnly - Return true when N produces a single value with
/// exactly one use, that use is either a CopyToReg without a glue operand or
/// an FP_EXTEND, and every use of that node is an X86ISD::RET_FLAG (with at
/// least one such use). On success Chain is overwritten: with the CopyToReg's
/// input chain in the CopyToReg case, or with its own incoming value in the
/// FP_EXTEND case. On failure Chain is left untouched.
01711 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
01712   if (N->getNumValues() != 1)
01713     return false;
01714   if (!N->hasNUsesOfValue(1, 0))
01715     return false;
01716 
01717   SDValue TCChain = Chain;
01718   SDNode *Copy = *N->use_begin();  // The single user established above.
01719   if (Copy->getOpcode() == ISD::CopyToReg) {
01720     // If the copy has a glue operand, we conservatively assume it isn't safe to
01721     // perform a tail call.
01722     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
01723       return false;
01724     TCChain = Copy->getOperand(0);
01725   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
01726     return false;
01727 
01728   bool HasRet = false;  // Guards against a user node that has no uses at all.
01729   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
01730        UI != UE; ++UI) {
01731     if (UI->getOpcode() != X86ISD::RET_FLAG)
01732       return false;
01733     HasRet = true;
01734   }
01735 
01736   if (!HasRet)
01737     return false;
01738 
01739   Chain = TCChain;
01740   return true;
01741 }
01742 
/// getTypeForExtArgOrReturn - Return the type VT is widened to for the given
/// extension kind. The minimum is the register type of i8 when zero-extending
/// an i1 on a 64-bit target, and the register type of i32 in every other
/// case. VT is returned unchanged unless it is narrower than that minimum.
01743 MVT
01744 X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
01745                                             ISD::NodeType ExtendKind) const {
01746   MVT ReturnMVT;
01747   // TODO: Is this also valid on 32-bit?
01748   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
01749     ReturnMVT = MVT::i8;
01750   else
01751     ReturnMVT = MVT::i32;
01752 
01753   MVT MinVT = getRegisterType(ReturnMVT);
01754   return VT.bitsLT(MinVT) ? MinVT : VT;
01755 }
01756 
01757 /// LowerCallResult - Lower the result values of a call into the
01758 /// appropriate copies out of appropriate physical registers. Locations come
01759 /// from RetCC_X86. Each lowered value is appended to InVals, and the updated
/// chain is returned. InFlag is the incoming glue and is re-threaded through
/// every copy. Calls report_fatal_error when an f32/f64 result needs SSE
/// (64-bit target, or the result is marked inreg) but SSE1 is unavailable.
01760 SDValue
01761 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
01762                                    CallingConv::ID CallConv, bool isVarArg,
01763                                    const SmallVectorImpl<ISD::InputArg> &Ins,
01764                                    SDLoc dl, SelectionDAG &DAG,
01765                                    SmallVectorImpl<SDValue> &InVals) const {
01766 
01767   // Assign locations to each value returned by this call.
01768   SmallVector<CCValAssign, 16> RVLocs;
01769   bool Is64Bit = Subtarget->is64Bit();
01770   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
01771                  getTargetMachine(), RVLocs, *DAG.getContext());
01772   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
01773 
01774   // Copy all of the result registers out of their specified physreg.
01775   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
01776     CCValAssign &VA = RVLocs[i];
01777     EVT CopyVT = VA.getValVT();
01778 
01779     // If this is x86-64, and we disabled SSE, we can't return FP values
01780     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
01781         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
01782       report_fatal_error("SSE register return with SSE disabled");
01783     }
01784 
01785     SDValue Val;
01786 
01787     // If this is a call to a function that returns an fp value on the floating
01788     // point stack, we must guarantee the value is popped from the stack, so
01789     // a CopyFromReg is not good enough - the copy instruction may be eliminated
01790     // if the return value is not used. We use the FpPOP_RETVAL instruction
01791     // instead.
01792     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
01793       // If we prefer to use the value in xmm registers, copy it out as f80 and
01794       // use a truncate to move it from fp stack reg to xmm reg.
01795       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
01796       SDValue Ops[] = { Chain, InFlag };
01797       Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
01798                                          MVT::Other, MVT::Glue, Ops), 1);
01799       Val = Chain.getValue(0);
01800 
01801       // Round the f80 to the right size, which also moves it to the appropriate
01802       // xmm register.
01803       if (CopyVT != VA.getValVT())
01804         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
01805                           // This truncation won't change the value.
01806                           DAG.getIntPtrConstant(1));
01807     } else {
01808       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
01809                                  CopyVT, InFlag).getValue(1);
01810       Val = Chain.getValue(0);
01811     }
01812     InFlag = Chain.getValue(2);  // In both branches: result #0 = value, #1 = chain, #2 = glue.
01813     InVals.push_back(Val);
01814   }
01815 
01816   return Chain;
01817 }
01818 
01819 //===----------------------------------------------------------------------===//
01820 //                C & StdCall & Fast Calling Convention implementation
01821 //===----------------------------------------------------------------------===//
01822 //  The StdCall calling convention is the standard for many Windows API
01823 //  routines. It differs from the C calling convention only slightly: the
01824 //  callee cleans up the stack, not the caller. Symbols are also decorated
01825 //  in a special way. It doesn't support any vector arguments.
01826 //  For info on the fast calling convention, see the Fast Calling Convention
01827 //  (tail call) implementation, LowerX86_32FastCCCallTo.
01828 
01829 /// StructReturnType - Classification of struct-return (sret) usage, as
01830 /// produced by callIsStructReturn and argsAreStructReturn.
01831 enum StructReturnType {
01832   NotStructReturn,   // No first argument, or the first argument is not sret.
01833   RegStructReturn,   // First argument is sret and also flagged inreg.
01834   StackStructReturn  // First argument is sret without the inreg flag.
01835 };
/// callIsStructReturn - Determines whether a call uses struct return
/// semantics by inspecting the flags of its first outgoing argument.
01836 static StructReturnType
01837 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
01838   if (Outs.empty())
01839     return NotStructReturn;
01840 
01841   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
01842   if (!Flags.isSRet())
01843     return NotStructReturn;
01844   if (Flags.isInReg())
01845     return RegStructReturn;
01846   return StackStructReturn;
01847 }
01848 
01849 /// argsAreStructReturn - Determines whether a function uses struct
01850 /// return semantics by inspecting the flags of its first incoming argument:
/// NotStructReturn when there is no argument or it is not sret,
/// RegStructReturn when it is sret and inreg, StackStructReturn otherwise.
01851 static StructReturnType
01852 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
01853   if (Ins.empty())
01854     return NotStructReturn;
01855 
01856   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
01857   if (!Flags.isSRet())
01858     return NotStructReturn;
01859   if (Flags.isInReg())
01860     return RegStructReturn;
01861   return StackStructReturn;
01862 }
01863 
01864 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
01865 /// by "Src" to address "Dst" with size and alignment information specified by
01866 /// the specific parameter attribute. The copy will be passed as a byval
01867 /// function parameter. Returns the memcpy node built by DAG.getMemcpy; the
/// copy is non-volatile, always inlined, and uses default pointer info.
01868 static SDValue
01869 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
01870                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
01871                           SDLoc dl) {
01872   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);  // Size is always an i32 constant.
01873 
01874   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
01875                        /*isVolatile*/false, /*AlwaysInline=*/true,
01876                        MachinePointerInfo(), MachinePointerInfo());
01877 }
01878 
01879 /// IsTailCallConvention - Return true if the calling convention is one that
01880 /// supports tail call optimization: Fast, GHC or HiPE.
01881 static bool IsTailCallConvention(CallingConv::ID CC) {
01882   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
01883           CC == CallingConv::HiPE);
01884 }
01885 
/// mayBeEmittedAsTailCall - Return true if CI is marked as a tail call, tail
/// calls are not disabled in the target options, and the callee's calling
/// convention is C or one accepted by IsTailCallConvention.
01886 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
01887   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
01888     return false;
01889 
01890   CallSite CS(CI);
01891   CallingConv::ID CalleeCC = CS.getCallingConv();
01892   if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
01893     return false;
01894 
01895   return true;
01896 }
01897 
01898 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
01899 /// a tailcall target by changing its ABI, i.e. GuaranteedTailCallOpt is set
/// and CC is one of the conventions accepted by IsTailCallConvention.
01900 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
01901                                    bool GuaranteedTailCallOpt) {
01902   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
01903 }
01904 
/// LowerMemArgument - Produce the SDValue for incoming argument i, which VA
/// places in memory. A fixed stack object is created at VA.getLocMemOffset().
/// For byval arguments the frame index itself is returned (no load); for all
/// other arguments a load from the slot is returned. When VA is marked
/// Indirect the loaded type is the location type (the address) rather than
/// the value type. The object is created immutable unless the argument is
/// byval or the function is made tail-call safe (GuaranteedTailCallOpt with
/// a tail-call calling convention).
01905 SDValue
01906 X86TargetLowering::LowerMemArgument(SDValue Chain,
01907                                     CallingConv::ID CallConv,
01908                                     const SmallVectorImpl<ISD::InputArg> &Ins,
01909                                     SDLoc dl, SelectionDAG &DAG,
01910                                     const CCValAssign &VA,
01911                                     MachineFrameInfo *MFI,
01912                                     unsigned i) const {
01913   // Create the nodes corresponding to a load from this parameter slot.
01914   ISD::ArgFlagsTy Flags = Ins[i].Flags;
01915   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
01916                               getTargetMachine().Options.GuaranteedTailCallOpt);
01917   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
01918   EVT ValVT;
01919 
01920   // If value is passed by pointer we have address passed instead of the value
01921   // itself.
01922   if (VA.getLocInfo() == CCValAssign::Indirect)
01923     ValVT = VA.getLocVT();
01924   else
01925     ValVT = VA.getValVT();
01926 
01927   // FIXME: For now, all byval parameter objects are marked mutable. This can be
01928   // changed with more analysis.
01929   // In case of tail call optimization mark all arguments mutable. Since they
01930   // could be overwritten by lowering of arguments in case of a tail call.
01931   if (Flags.isByVal()) {
01932     unsigned Bytes = Flags.getByValSize();
01933     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
01934     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
01935     return DAG.getFrameIndex(FI, getPointerTy());  // Byval: hand back the address, not a load.
01936   } else {
01937     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
01938                                     VA.getLocMemOffset(), isImmutable);
01939     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
01940     return DAG.getLoad(ValVT, dl, Chain, FIN,
01941                        MachinePointerInfo::getFixedStack(FI),
01942                        false, false, false, 0);
01943   }
01944 }
01945 
/// LowerFormalArguments - Lower the incoming (formal) arguments of the
/// function currently being compiled.
///
/// Every argument in \p Ins is assigned a location with CC_X86.  Register
/// arguments become CopyFromReg nodes (plus assert-ext / bitcast / truncate
/// nodes as dictated by the location info); stack arguments are materialized
/// by LowerMemArgument.  One value per location is appended to \p InVals.
///
/// Besides producing the argument values, this routine records in
/// X86MachineFunctionInfo: the sret return register, the varargs frame index
/// and register save area, the number of bytes the callee pops on return, and
/// the argument stack size.
///
/// \returns the chain, updated with the sret copy and with the stores of the
/// vararg registers when those are emitted.
01946 SDValue
01947 X86TargetLowering::LowerFormalArguments(SDValue Chain,
01948                                         CallingConv::ID CallConv,
01949                                         bool isVarArg,
01950                                       const SmallVectorImpl<ISD::InputArg> &Ins,
01951                                         SDLoc dl,
01952                                         SelectionDAG &DAG,
01953                                         SmallVectorImpl<SDValue> &InVals)
01954                                           const {
01955   MachineFunction &MF = DAG.getMachineFunction();
01956   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01957 
        // An externally visible "main" on Cygwin/MinGW targets always gets a
        // frame pointer.
01958   const Function* Fn = MF.getFunction();
01959   if (Fn->hasExternalLinkage() &&
01960       Subtarget->isTargetCygMing() &&
01961       Fn->getName() == "main")
01962     FuncInfo->setForceFramePointer(true);
01963 
01964   MachineFrameInfo *MFI = MF.getFrameInfo();
01965   bool Is64Bit = Subtarget->is64Bit();
01966   bool IsWindows = Subtarget->isTargetWindows();
01967   bool IsWin64 = Subtarget->isTargetWin64();
01968 
01969   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
01970          "Var args not supported with calling convention fastcc, ghc or hipe");
01971 
01972   // Assign locations to all of the incoming arguments.
01973   SmallVector<CCValAssign, 16> ArgLocs;
01974   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
01975                  ArgLocs, *DAG.getContext());
01976 
01977   // Allocate shadow area for Win64
01978   if (IsWin64) {
01979     CCInfo.AllocateStack(32, 8);
01980   }
01981 
01982   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
01983 
        // LastVal remembers the value number of the previous location so the
        // assert below can reject a value that is split across locations.
01984   unsigned LastVal = ~0U;
01985   SDValue ArgValue;
01986   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
01987     CCValAssign &VA = ArgLocs[i];
01988     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
01989     // places.
01990     assert(VA.getValNo() != LastVal &&
01991            "Don't support value assigned to multiple locs yet");
          // Keeps LastVal referenced in builds where the assert is compiled out.
01992     (void)LastVal;
01993     LastVal = VA.getValNo();
01994 
01995     if (VA.isRegLoc()) {
            // Pick the register class that matches the location type.
01996       EVT RegVT = VA.getLocVT();
01997       const TargetRegisterClass *RC;
01998       if (RegVT == MVT::i32)
01999         RC = &X86::GR32RegClass;
02000       else if (Is64Bit && RegVT == MVT::i64)
02001         RC = &X86::GR64RegClass;
02002       else if (RegVT == MVT::f32)
02003         RC = &X86::FR32RegClass;
02004       else if (RegVT == MVT::f64)
02005         RC = &X86::FR64RegClass;
02006       else if (RegVT.is256BitVector())
02007         RC = &X86::VR256RegClass;
02008       else if (RegVT.is128BitVector())
02009         RC = &X86::VR128RegClass;
02010       else if (RegVT == MVT::x86mmx)
02011         RC = &X86::VR64RegClass;
02012       else
02013         llvm_unreachable("Unknown argument type!");
02014 
            // Mark the physical register live-in and read it through a virtual
            // register.
02015       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02016       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02017 
02018       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02019       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02020       // right size.
02021       if (VA.getLocInfo() == CCValAssign::SExt)
02022         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02023                                DAG.getValueType(VA.getValVT()));
02024       else if (VA.getLocInfo() == CCValAssign::ZExt)
02025         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02026                                DAG.getValueType(VA.getValVT()));
02027       else if (VA.getLocInfo() == CCValAssign::BCvt)
02028         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02029 
02030       if (VA.isExtInLoc()) {
02031         // Handle MMX values passed in XMM regs.
02032         if (RegVT.isVector())
02033           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02034         else
02035           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02036       }
02037     } else {
            // Stack-passed argument: LowerMemArgument creates the fixed stack
            // object and the load (or the frame index for byval arguments).
02038       assert(VA.isMemLoc());
02039       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02040     }
02041 
02042     // If value is passed via pointer - do a load.
02043     if (VA.getLocInfo() == CCValAssign::Indirect)
02044       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02045                              MachinePointerInfo(), false, false, false, 0);
02046 
02047     InVals.push_back(ArgValue);
02048   }
02049 
02050   // The x86-64 ABIs require that for returning structs by value we copy
02051   // the sret argument into %rax/%eax (depending on ABI) for the return.
02052   // Win32 requires us to put the sret argument to %eax as well.
02053   // Save the argument into a virtual register so that we can access it
02054   // from the return points.
02055   if (MF.getFunction()->hasStructRetAttr() &&
02056       (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
          // NOTE(review): this declaration shadows the FuncInfo obtained at the
          // top of the function; both come from the same MF.getInfo call.
02057     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02058     unsigned Reg = FuncInfo->getSRetReturnReg();
02059     if (!Reg) {
02060       MVT PtrTy = getPointerTy();
02061       Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02062       FuncInfo->setSRetReturnReg(Reg);
02063     }
          // NOTE(review): InVals[0] presumes the sret pointer is the first
          // lowered argument - confirm against the callers/ABI.
02064     SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
02065     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02066   }
02067 
02068   unsigned StackSize = CCInfo.getNextStackOffset();
02069   // Align stack specially for tail calls.
02070   if (FuncIsMadeTailCallSafe(CallConv,
02071                              MF.getTarget().Options.GuaranteedTailCallOpt))
02072     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02073 
02074   // If the function takes variable number of arguments, make a frame index for
02075   // the start of the first vararg value... for expansion of llvm.va_start.
02076   if (isVarArg) {
          // In 32-bit mode no such frame index is created for the fastcall and
          // thiscall conventions.
02077     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02078                     CallConv != CallingConv::X86_ThisCall)) {
02079       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
02080     }
02081     if (Is64Bit) {
02082       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
02083 
02084       // FIXME: We should really autogenerate these arrays
02085       static const uint16_t GPR64ArgRegsWin64[] = {
02086         X86::RCX, X86::RDX, X86::R8,  X86::R9
02087       };
02088       static const uint16_t GPR64ArgRegs64Bit[] = {
02089         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02090       };
02091       static const uint16_t XMMArgRegs64Bit[] = {
02092         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02093         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02094       };
02095       const uint16_t *GPR64ArgRegs;
02096       unsigned NumXMMRegs = 0;
02097 
02098       if (IsWin64) {
02099         // The XMM registers which might contain var arg parameters are shadowed
02100         // in their paired GPR.  So we only need to save the GPR to their home
02101         // slots.
02102         TotalNumIntRegs = 4;
02103         GPR64ArgRegs = GPR64ArgRegsWin64;
02104       } else {
02105         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
02106         GPR64ArgRegs = GPR64ArgRegs64Bit;
02107 
02108         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
02109                                                 TotalNumXMMRegs);
02110       }
            // Index of the first GPR not consumed by a named argument; the
            // registers from there on may hold variadic arguments.
02111       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
02112                                                        TotalNumIntRegs);
02113 
02114       bool NoImplicitFloatOps = Fn->getAttributes().
02115         hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
02116       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02117              "SSE register cannot be used when SSE is disabled!");
02118       assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
02119                NoImplicitFloatOps) &&
02120              "SSE register cannot be used when SSE is disabled!");
02121       if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02122           !Subtarget->hasSSE1())
02123         // Kernel mode asks for SSE to be disabled, so don't push them
02124         // on the stack.
02125         TotalNumXMMRegs = 0;
02126 
02127       if (IsWin64) {
02128         const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
02129         // Get to the caller-allocated home save location.  Add 8 to account
02130         // for the return address.
02131         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02132         FuncInfo->setRegSaveFrameIndex(
02133           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02134         // Fixup to set vararg frame on shadow area (4 x i64).
02135         if (NumIntRegs < 4)
02136           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02137       } else {
02138         // For X86-64, if there are vararg parameters that are passed via
02139         // registers, then we must store them to their spots on the stack so
02140         // they may be loaded by dereferencing the result of va_next.
              // The GP offset skips the GPR slots taken by named arguments; the
              // FP offset starts after all GPR slots and skips the XMM slots
              // taken by named arguments.
02141         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02142         FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
02143         FuncInfo->setRegSaveFrameIndex(
02144           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
02145                                false));
02146       }
02147 
02148       // Store the integer parameter registers.
02149       SmallVector<SDValue, 8> MemOps;
02150       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02151                                         getPointerTy());
            // NOTE(review): on the Win64 path setVarArgsGPOffset is not called in
            // this function, so Offset is whatever FuncInfo already holds -
            // presumably 0; confirm.
02152       unsigned Offset = FuncInfo->getVarArgsGPOffset();
02153       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
02154         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02155                                   DAG.getIntPtrConstant(Offset));
02156         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
02157                                      &X86::GR64RegClass);
02158         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
              // The store is chained on the copy's own output chain
              // (Val.getValue(1)).
02159         SDValue Store =
02160           DAG.getStore(Val.getValue(1), dl, Val, FIN,
02161                        MachinePointerInfo::getFixedStack(
02162                          FuncInfo->getRegSaveFrameIndex(), Offset),
02163                        false, false, 0);
02164         MemOps.push_back(Store);
02165         Offset += 8;
02166       }
02167 
02168       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
02169         // Now store the XMM (fp + vector) parameter registers.
              // Operand order of the VASTART_SAVE_XMM_REGS node built below:
              // chain, value of AL, register save frame index, FP offset, then
              // each remaining XMM register value.
02170         SmallVector<SDValue, 11> SaveXMMOps;
02171         SaveXMMOps.push_back(Chain);
02172 
02173         unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02174         SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
02175         SaveXMMOps.push_back(ALVal);
02176 
02177         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02178                                FuncInfo->getRegSaveFrameIndex()));
02179         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02180                                FuncInfo->getVarArgsFPOffset()));
02181 
02182         for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
02183           unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
02184                                        &X86::VR128RegClass);
02185           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
02186           SaveXMMOps.push_back(Val);
02187         }
02188         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02189                                      MVT::Other,
02190                                      &SaveXMMOps[0], SaveXMMOps.size()));
02191       }
02192 
            // Merge all register-save operations into the chain that is returned.
02193       if (!MemOps.empty())
02194         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
02195                             &MemOps[0], MemOps.size());
02196     }
02197   }
02198 
02199   // Some CCs need callee pop.
02200   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02201                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02202     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02203   } else {
02204     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02205     // If this is an sret function, the return should pop the hidden pointer.
02206     if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
02207         argsAreStructReturn(Ins) == StackStructReturn)
02208       FuncInfo->setBytesToPopOnReturn(4);
02209   }
02210 
02211   if (!Is64Bit) {
02212     // RegSaveFrameIndex is X86-64 only.
          // NOTE(review): 0xAAAAAAA looks like a recognizable placeholder value,
          // not a real frame index - confirm.
02213     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02214     if (CallConv == CallingConv::X86_FastCall ||
02215         CallConv == CallingConv::X86_ThisCall)
02216       // No varargs frame index is created for these conventions in 32-bit
            // mode (see the isVarArg handling above); store the placeholder too.
02217       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02218   }
02219 
02220   FuncInfo->setArgumentStackSize(StackSize);
02221 
02222   return Chain;
02223 }
02224 
02225 SDValue
02226 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02227                                     SDValue StackPtr, SDValue Arg,
02228                                     SDLoc dl, SelectionDAG &DAG,
02229                                     const CCValAssign &VA,
02230                                     ISD::ArgFlagsTy Flags) const {
02231   unsigned LocMemOffset = VA.getLocMemOffset();
02232   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02233   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02234   if (Flags.isByVal())
02235     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02236 
02237   return DAG.getStore(Chain, dl, Arg, PtrOff,
02238                       MachinePointerInfo::getStack(LocMemOffset),
02239                       false, false, 0);
02240 }
02241 
02242 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
02243 /// optimization is performed and it is required.
02244 SDValue
02245 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02246                                            SDValue &OutRetAddr, SDValue Chain,
02247                                            bool IsTailCall, bool Is64Bit,
02248                                            int FPDiff, SDLoc dl) const {
02249   // Adjust the Return address stack slot.
02250   EVT VT = getPointerTy();
02251   OutRetAddr = getReturnAddressFrameIndex(DAG);
02252 
02253   // Load the "old" Return address.
02254   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02255                            false, false, false, 0);
02256   return SDValue(OutRetAddr.getNode(), 1);
02257 }
02258 
02259 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
02260 /// optimization is performed and it is required (FPDiff!=0).
02261 static SDValue
02262 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
02263                          SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
02264                          unsigned SlotSize, int FPDiff, SDLoc dl) {
02265   // Store the return address to the appropriate stack slot.
02266   if (!FPDiff) return Chain;
02267   // Calculate the new stack slot for the return address.
02268   int NewReturnAddrFI =
02269     MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
02270   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02271   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02272                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02273                        false, false, 0);
02274   return Chain;
02275 }
02276 
02277 SDValue
02278 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02279                              SmallVectorImpl<SDValue> &InVals) const {
02280   SelectionDAG &DAG                     = CLI.DAG;
02281   SDLoc &dl                          = CLI.DL;
02282   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
02283   SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
02284   SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
02285   SDValue Chain                         = CLI.Chain;
02286   SDValue Callee                        = CLI.Callee;
02287   CallingConv::ID CallConv              = CLI.CallConv;
02288   bool &isTailCall                      = CLI.IsTailCall;
02289   bool isVarArg                         = CLI.IsVarArg;
02290 
02291   MachineFunction &MF = DAG.getMachineFunction();
02292   bool Is64Bit        = Subtarget->is64Bit();
02293   bool IsWin64        = Subtarget->isTargetWin64();
02294   bool IsWindows      = Subtarget->isTargetWindows();
02295   StructReturnType SR = callIsStructReturn(Outs);
02296   bool IsSibcall      = false;
02297 
02298   if (MF.getTarget().Options.DisableTailCalls)
02299     isTailCall = false;
02300 
02301   if (isTailCall) {
02302     // Check if it's really possible to do a tail call.
02303     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02304                     isVarArg, SR != NotStructReturn,
02305                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02306                     Outs, OutVals, Ins, DAG);
02307 
02308     // Sibcalls are automatically detected tailcalls which do not require
02309     // ABI changes.
02310     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02311       IsSibcall = true;
02312 
02313     if (isTailCall)
02314       ++NumTailCalls;
02315   }
02316 
02317   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02318          "Var args not supported with calling convention fastcc, ghc or hipe");
02319 
02320   // Analyze operands of the call, assigning locations to each operand.
02321   SmallVector<CCValAssign, 16> ArgLocs;
02322   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
02323                  ArgLocs, *DAG.getContext());
02324 
02325   // Allocate shadow area for Win64
02326   if (IsWin64) {
02327     CCInfo.AllocateStack(32, 8);
02328   }
02329 
02330   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02331 
02332   // Get a count of how many bytes are to be pushed on the stack.
02333   unsigned NumBytes = CCInfo.getNextStackOffset();
02334   if (IsSibcall)
02335     // This is a sibcall. The memory operands are available in caller's
02336     // own caller's stack.
02337     NumBytes = 0;
02338   else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
02339            IsTailCallConvention(CallConv))
02340     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02341 
02342   int FPDiff = 0;
02343   if (isTailCall && !IsSibcall) {
02344     // Lower arguments at fp - stackoffset + fpdiff.
02345     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02346     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02347 
02348     FPDiff = NumBytesCallerPushed - NumBytes;
02349 
02350     // Set the delta of movement of the returnaddr stackslot.
02351     // But only set if delta is greater than previous delta.
02352     if (FPDiff < X86Info->getTCReturnAddrDelta())
02353       X86Info->setTCReturnAddrDelta(FPDiff);
02354   }
02355 
02356   if (!IsSibcall)
02357     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
02358 
02359   SDValue RetAddrFrIdx;
02360   // Load return address for tail calls.
02361   if (isTailCall && FPDiff)
02362     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02363                                     Is64Bit, FPDiff, dl);
02364 
02365   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02366   SmallVector<SDValue, 8> MemOpChains;
02367   SDValue StackPtr;
02368 
02369   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02370   // of tail call optimization arguments are handle later.
02371   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02372     CCValAssign &VA = ArgLocs[i];
02373     EVT RegVT = VA.getLocVT();
02374     SDValue Arg = OutVals[i];
02375     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02376     bool isByVal = Flags.isByVal();
02377 
02378     // Promote the value if needed.
02379     switch (VA.getLocInfo()) {
02380     default: llvm_unreachable("Unknown loc info!");
02381     case CCValAssign::Full: break;
02382     case CCValAssign::SExt:
02383       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02384       break;
02385     case CCValAssign::ZExt:
02386       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02387       break;
02388     case CCValAssign::AExt:
02389       if (RegVT.is128BitVector()) {
02390         // Special case: passing MMX values in XMM registers.
02391         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02392         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02393         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02394       } else
02395         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02396       break;
02397     case CCValAssign::BCvt:
02398       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02399       break;
02400     case CCValAssign::Indirect: {
02401       // Store the argument.
02402       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02403       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02404       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02405                            MachinePointerInfo::getFixedStack(FI),
02406                            false, false, 0);
02407       Arg = SpillSlot;
02408       break;
02409     }
02410     }
02411 
02412     if (VA.isRegLoc()) {
02413       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02414       if (isVarArg && IsWin64) {
02415         // Win64 ABI requires argument XMM reg to be copied to the corresponding
02416         // shadow reg if callee is a varargs function.
02417         unsigned ShadowReg = 0;
02418         switch (VA.getLocReg()) {
02419         case X86::XMM0: ShadowReg = X86::RCX; break;
02420         case X86::XMM1: ShadowReg = X86::RDX; break;
02421         case X86::XMM2: ShadowReg = X86::R8; break;
02422         case X86::XMM3: ShadowReg = X86::R9; break;
02423         }
02424         if (ShadowReg)
02425           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02426       }
02427     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02428       assert(VA.isMemLoc());
02429       if (StackPtr.getNode() == 0)
02430         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02431                                       getPointerTy());
02432       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02433                                              dl, DAG, VA, Flags));
02434     }
02435   }
02436 
02437   if (!MemOpChains.empty())
02438     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
02439                         &MemOpChains[0], MemOpChains.size());
02440 
02441   if (Subtarget->isPICStyleGOT()) {
02442     // ELF / PIC requires GOT in the EBX register before function calls via PLT
02443     // GOT pointer.
02444     if (!isTailCall) {
02445       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02446                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02447     } else {
02448       // If we are tail calling and generating PIC/GOT style code load the
02449       // address of the callee into ECX. The value in ecx is used as target of
02450       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02451       // for tail calls on PIC/GOT architectures. Normally we would just put the
02452       // address of GOT into ebx and then call target@PLT. But for tail calls
02453       // ebx would be restored (since ebx is callee saved) before jumping to the
02454       // target@PLT.
02455 
02456       // Note: The actual moving to ECX is done further down.
02457       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02458       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02459           !G->getGlobal()->hasProtectedVisibility())
02460         Callee = LowerGlobalAddress(Callee, DAG);
02461       else if (isa<ExternalSymbolSDNode>(Callee))
02462         Callee = LowerExternalSymbol(Callee, DAG);
02463     }
02464   }
02465 
02466   if (Is64Bit && isVarArg && !IsWin64) {
02467     // From AMD64 ABI document:
02468     // For calls that may call functions that use varargs or stdargs
02469     // (prototype-less calls or calls to functions containing ellipsis (...) in
02470     // the declaration) %al is used as hidden argument to specify the number
02471     // of SSE registers used. The contents of %al do not need to match exactly
02472     // the number of registers, but must be an ubound on the number of SSE
02473     // registers used and is in the range 0 - 8 inclusive.
02474 
02475     // Count the number of XMM registers allocated.
02476     static const uint16_t XMMArgRegs[] = {
02477       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02478       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02479     };
02480     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
02481     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02482            && "SSE registers cannot be used when SSE is disabled");
02483 
02484     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02485                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
02486   }
02487 
02488   // For tail calls lower the arguments to the 'real' stack slot.
02489   if (isTailCall) {
02490     // Force all the incoming stack arguments to be loaded from the stack
02491     // before any new outgoing arguments are stored to the stack, because the
02492     // outgoing stack slots may alias the incoming argument stack slots, and
02493     // the alias isn't otherwise explicit. This is slightly more conservative
02494     // than necessary, because it means that each store effectively depends
02495     // on every argument instead of just those arguments it would clobber.
02496     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02497 
02498     SmallVector<SDValue, 8> MemOpChains2;
02499     SDValue FIN;
02500     int FI = 0;
02501     if (getTargetMachine().Options.GuaranteedTailCallOpt) {
02502       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02503         CCValAssign &VA = ArgLocs[i];
02504         if (VA.isRegLoc())
02505           continue;
02506         assert(VA.isMemLoc());
02507         SDValue Arg = OutVals[i];
02508         ISD::ArgFlagsTy Flags = Outs[i].Flags;
02509         // Create frame index.
02510         int32_t Offset = VA.getLocMemOffset()+FPDiff;
02511         uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
02512         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
02513         FIN = DAG.getFrameIndex(FI, getPointerTy());
02514 
02515         if (Flags.isByVal()) {
02516           // Copy relative to framepointer.
02517           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
02518           if (StackPtr.getNode() == 0)
02519             StackPtr = DAG.getCopyFromReg(Chain, dl,
02520                                           RegInfo->getStackRegister(),
02521                                           getPointerTy());
02522           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
02523 
02524           MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
02525                                                            ArgChain,
02526                                                            Flags, DAG, dl));
02527         } else {
02528           // Store relative to framepointer.
02529           MemOpChains2.push_back(
02530             DAG.getStore(ArgChain, dl, Arg, FIN,
02531                          MachinePointerInfo::getFixedStack(FI),
02532                          false, false, 0));
02533         }
02534       }
02535     }
02536 
02537     if (!MemOpChains2.empty())
02538       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
02539                           &MemOpChains2[0], MemOpChains2.size());
02540 
02541     // Store the return address to the appropriate stack slot.
02542     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
02543                                      getPointerTy(), RegInfo->getSlotSize(),
02544                                      FPDiff, dl);
02545   }
02546 
02547   // Build a sequence of copy-to-reg nodes chained together with token chain
02548   // and flag operands which copy the outgoing args into registers.
02549   SDValue InFlag;
02550   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
02551     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
02552                              RegsToPass[i].second, InFlag);
02553     InFlag = Chain.getValue(1);
02554   }
02555 
02556   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
02557     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
02558     // In the 64-bit large code model, we have to make all calls
02559     // through a register, since the call instruction's 32-bit
02560     // pc-relative offset may not be large enough to hold the whole
02561     // address.
02562   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
02563     // If the callee is a GlobalAddress node (quite common, every direct call
02564     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
02565     // it.
02566 
02567     // We should use extra load for direct calls to dllimported functions in
02568     // non-JIT mode.
02569     const GlobalValue *GV = G->getGlobal();
02570     if (!GV->hasDLLImportLinkage()) {
02571       unsigned char OpFlags = 0;
02572       bool ExtraLoad = false;
02573       unsigned WrapperKind = ISD::DELETED_NODE;
02574 
02575       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
02576       // external symbols most go through the PLT in PIC mode.  If the symbol
02577       // has hidden or protected visibility, or if it is static or local, then
02578       // we don't need to use the PLT - we can directly call it.
02579       if (Subtarget->isTargetELF() &&
02580           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
02581           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
02582         OpFlags = X86II::MO_PLT;
02583       } else if (Subtarget->isPICStyleStubAny() &&
02584                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
02585                  (!Subtarget->getTargetTriple().isMacOSX() ||
02586                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
02587         // PC-relative references to external symbols should go through $stub,
02588         // unless we're building with the leopard linker or later, which
02589         // automatically synthesizes these stubs.
02590         OpFlags = X86II::MO_DARWIN_STUB;
02591       } else if (Subtarget->isPICStyleRIPRel() &&
02592                  isa<Function>(GV) &&
02593                  cast<Function>(GV)->getAttributes().
02594                    hasAttribute(AttributeSet::FunctionIndex,
02595                                 Attribute::NonLazyBind)) {
02596         // If the function is marked as non-lazy, generate an indirect call
02597         // which loads from the GOT directly. This avoids runtime overhead
02598         // at the cost of eager binding (and one extra byte of encoding).
02599         OpFlags = X86II::MO_GOTPCREL;
02600         WrapperKind = X86ISD::WrapperRIP;
02601         ExtraLoad = true;
02602       }
02603 
02604       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
02605                                           G->getOffset(), OpFlags);
02606 
02607       // Add a wrapper if needed.
02608       if (WrapperKind != ISD::DELETED_NODE)
02609         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
02610       // Add extra indirection if needed.
02611       if (ExtraLoad)
02612         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
02613                              MachinePointerInfo::getGOT(),
02614                              false, false, false, 0);
02615     }
02616   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
02617     unsigned char OpFlags = 0;
02618 
02619     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
02620     // external symbols should go through the PLT.
02621     if (Subtarget->isTargetELF() &&
02622         getTargetMachine().getRelocationModel() == Reloc::PIC_) {
02623       OpFlags = X86II::MO_PLT;
02624     } else if (Subtarget->isPICStyleStubAny() &&
02625                (!Subtarget->getTargetTriple().isMacOSX() ||
02626                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
02627       // PC-relative references to external symbols should go through $stub,
02628       // unless we're building with the leopard linker or later, which
02629       // automatically synthesizes these stubs.
02630       OpFlags = X86II::MO_DARWIN_STUB;
02631     }
02632 
02633     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
02634                                          OpFlags);
02635   }
02636 
02637   // Returns a chain & a flag for retval copy to use.
02638   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
02639   SmallVector<SDValue, 8> Ops;
02640 
02641   if (!IsSibcall && isTailCall) {
02642     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
02643                            DAG.getIntPtrConstant(0, true), InFlag);
02644     InFlag = Chain.getValue(1);
02645   }
02646 
02647   Ops.push_back(Chain);
02648   Ops.push_back(Callee);
02649 
02650   if (isTailCall)
02651     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
02652 
02653   // Add argument registers to the end of the list so that they are known live
02654   // into the call.
02655   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
02656     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
02657                                   RegsToPass[i].second.getValueType()));
02658 
02659   // Add a register mask operand representing the call-preserved registers.
02660   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
02661   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
02662   assert(Mask && "Missing call preserved mask for calling convention");
02663   Ops.push_back(DAG.getRegisterMask(Mask));
02664 
02665   if (InFlag.getNode())
02666     Ops.push_back(InFlag);
02667 
02668   if (isTailCall) {
02669     // We used to do:
02670     //// If this is the first return lowered for this function, add the regs
02671     //// to the liveout set for the function.
02672     // This isn't right, although it's probably harmless on x86; liveouts
02673     // should be computed from returns not tail calls.  Consider a void
02674     // function making a tail call to a function returning int.
02675     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
02676   }
02677 
02678   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
02679   InFlag = Chain.getValue(1);
02680 
02681   // Create the CALLSEQ_END node.
02682   unsigned NumBytesForCalleeToPush;
02683   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02684                        getTargetMachine().Options.GuaranteedTailCallOpt))
02685     NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
02686   else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
02687            SR == StackStructReturn)
02688     // If this is a call to a struct-return function, the callee
02689     // pops the hidden struct pointer, so we have to push it back.
02690     // This is common for Darwin/X86, Linux & Mingw32 targets.
02691     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
02692     NumBytesForCalleeToPush = 4;
02693   else
02694     NumBytesForCalleeToPush = 0;  // Callee pops nothing.
02695 
02696   // Returns a flag for retval copy to use.
02697   if (!IsSibcall) {
02698     Chain = DAG.getCALLSEQ_END(Chain,
02699                                DAG.getIntPtrConstant(NumBytes, true),
02700                                DAG.getIntPtrConstant(NumBytesForCalleeToPush,
02701                                                      true),
02702                                InFlag);
02703     InFlag = Chain.getValue(1);
02704   }
02705 
02706   // Handle result values, copying them out of physregs into vregs that we
02707   // return.
02708   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
02709                          Ins, dl, DAG, InVals);
02710 }
02711 
02712 //===----------------------------------------------------------------------===//
02713 //                Fast Calling Convention (tail call) implementation
02714 //===----------------------------------------------------------------------===//
02715 
02716 //  Like stdcall, the callee cleans up the arguments, except that ECX is
02717 //  reserved for storing the tail called function address. Only 2 registers are
02718 //  free for argument passing (inreg). Tail call optimization is performed
02719 //  provided:
02720 //                * tailcallopt is enabled
02721 //                * caller/callee are fastcc
02722 //  On X86_64 architecture with GOT-style position independent code only local
02723 //  (within module) calls are supported at the moment.
02724 //  To keep the stack aligned according to platform abi the function
02725 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
02726 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
02727 //  If a tail called function callee has more arguments than the caller the
02728 //  caller needs to make sure that there is room to move the RETADDR to. This is
02729 //  achieved by reserving an area the size of the argument delta right after the
02730 //  original RETADDR, but before the saved framepointer or the spilled registers
02731 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
02732 //  stack layout:
02733 //    arg1
02734 //    arg2
02735 //    RETADDR
02736 //    [ new RETADDR
02737 //      move area ]
02738 //    (possible EBP)
02739 //    ESI
02740 //    EDI
02741 //    local1 ..
02742 
02743 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
02744 /// for a 16 byte align requirement.
02745 unsigned
02746 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
02747                                                SelectionDAG& DAG) const {
02748   MachineFunction &MF = DAG.getMachineFunction();
02749   const TargetMachine &TM = MF.getTarget();
02750   const TargetFrameLowering &TFI = *TM.getFrameLowering();
02751   unsigned StackAlignment = TFI.getStackAlignment();
02752   uint64_t AlignMask = StackAlignment - 1;
02753   int64_t Offset = StackSize;
02754   unsigned SlotSize = RegInfo->getSlotSize();
02755   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
02756     // Number smaller than 12 so just add the difference.
02757     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
02758   } else {
02759     // Mask out lower bits, add stackalignment once plus the 12 bytes.
02760     Offset = ((~AlignMask) & Offset) + StackAlignment +
02761       (StackAlignment-SlotSize);
02762   }
02763   return Offset;
02764 }
02765 
02766 /// MatchingStackOffset - Return true if the given stack call argument is
02767 /// already available in the same position (relatively) of the caller's
02768 /// incoming argument stack.
02769 static
02770 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
02771                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
02772                          const X86InstrInfo *TII) {
02773   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
02774   int FI = INT_MAX;
02775   if (Arg.getOpcode() == ISD::CopyFromReg) {
02776     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
02777     if (!TargetRegisterInfo::isVirtualRegister(VR))
02778       return false;
02779     MachineInstr *Def = MRI->getVRegDef(VR);
02780     if (!Def)
02781       return false;
02782     if (!Flags.isByVal()) {
02783       if (!TII->isLoadFromStackSlot(Def, FI))
02784         return false;
02785     } else {
02786       unsigned Opcode = Def->getOpcode();
02787       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
02788           Def->getOperand(1).isFI()) {
02789         FI = Def->getOperand(1).getIndex();
02790         Bytes = Flags.getByValSize();
02791       } else
02792         return false;
02793     }
02794   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
02795     if (Flags.isByVal())
02796       // ByVal argument is passed in as a pointer but it's now being
02797       // dereferenced. e.g.
02798       // define @foo(%struct.X* %A) {
02799       //   tail call @bar(%struct.X* byval %A)
02800       // }
02801       return false;
02802     SDValue Ptr = Ld->getBasePtr();
02803     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
02804     if (!FINode)
02805       return false;
02806     FI = FINode->getIndex();
02807   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
02808     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
02809     FI = FINode->getIndex();
02810     Bytes = Flags.getByValSize();
02811   } else
02812     return false;
02813 
02814   assert(FI != INT_MAX);
02815   if (!MFI->isFixedObjectIndex(FI))
02816     return false;
02817   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
02818 }
02819 
02820 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
02821 /// for tail call optimization. Targets which want to do tail call
02822 /// optimization should implement this function.
      ///
      /// Only callees using the C convention or a tail-call convention are
      /// considered. When GuaranteedTailCallOpt is set, the answer is simply
      /// whether CalleeCC is a tail-call convention that matches the caller's.
      /// Otherwise a series of sibcall checks is applied: no dynamic stack
      /// realignment, no struct-return on either side, compatible result
      /// locations, and outgoing stack arguments that already sit in the
      /// caller's matching fixed stack slots.
      ///
      /// \param Callee  the call target; inspected only to decide whether the
      ///                call address may need a register in 32-bit mode.
      /// \param RetTy   the callee's return type (checked against x86_fp80).
      /// \param Outs    outgoing argument descriptors.
      /// \param OutVals outgoing argument values.
      /// \param Ins     descriptors of the call's results.
      /// \returns true if the call may be lowered as a tail call.
02823 bool
02824 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
02825                                                      CallingConv::ID CalleeCC,
02826                                                      bool isVarArg,
02827                                                      bool isCalleeStructRet,
02828                                                      bool isCallerStructRet,
02829                                                      Type *RetTy,
02830                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
02831                                     const SmallVectorImpl<SDValue> &OutVals,
02832                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02833                                                      SelectionDAG &DAG) const {
02834   if (!IsTailCallConvention(CalleeCC) &&
02835       CalleeCC != CallingConv::C)
02836     return false;
02837 
02838   // If -tailcallopt is specified, make fastcc functions tail-callable.
02839   const MachineFunction &MF = DAG.getMachineFunction();
02840   const Function *CallerF = DAG.getMachineFunction().getFunction();
02841 
02842   // If the function return type is x86_fp80 and the callee return type is not,
02843   // then the FP_EXTEND of the call result is not a nop. It's not safe to
02844   // perform a tailcall optimization here.
02845   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
02846     return false;
02847 
02848   CallingConv::ID CallerCC = CallerF->getCallingConv();
02849   bool CCMatch = CallerCC == CalleeCC;
02850 
        // With guaranteed tail call optimization the decision is final here:
        // only a tail-call convention shared by caller and callee qualifies,
        // and none of the sibcall checks below are consulted.
02851   if (getTargetMachine().Options.GuaranteedTailCallOpt) {
02852     if (IsTailCallConvention(CalleeCC) && CCMatch)
02853       return true;
02854     return false;
02855   }
02856 
02857   // Look for obvious safe cases to perform tail call optimization that do not
02858   // require ABI changes. This is what gcc calls sibcall.
02859 
02860   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
02861   // emit a special epilogue.
02862   if (RegInfo->needsStackRealignment(MF))
02863     return false;
02864 
02865   // Also avoid sibcall optimization if either caller or callee uses struct
02866   // return semantics.
02867   if (isCalleeStructRet || isCallerStructRet)
02868     return false;
02869 
02870   // An stdcall caller is expected to clean up its arguments; the callee
02871   // isn't going to do that.
02872   if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
02873     return false;
02874 
02875   // Do not sibcall optimize vararg calls unless all arguments are passed via
02876   // registers.
02877   if (isVarArg && !Outs.empty()) {
02878 
02879     // Optimizing for varargs on Win64 is unlikely to be safe without
02880     // additional testing.
02881     if (Subtarget->isTargetWin64())
02882       return false;
02883 
02884     SmallVector<CCValAssign, 16> ArgLocs;
02885     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
02886                    getTargetMachine(), ArgLocs, *DAG.getContext());
02887 
02888     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02889     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
02890       if (!ArgLocs[i].isRegLoc())
02891         return false;
02892   }
02893 
02894   // If the call result is in ST0 / ST1, it needs to be popped off the x87
02895   // stack.  Therefore, if it's not used by the call it is not safe to optimize
02896   // this into a sibcall.
02897   bool Unused = false;
02898   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
02899     if (!Ins[i].Used) {
02900       Unused = true;
02901       break;
02902     }
02903   }
        // The result locations are only computed when at least one result is
        // unused; any ST0/ST1 result then blocks the sibcall.
02904   if (Unused) {
02905     SmallVector<CCValAssign, 16> RVLocs;
02906     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
02907                    getTargetMachine(), RVLocs, *DAG.getContext());
02908     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02909     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02910       CCValAssign &VA = RVLocs[i];
02911       if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
02912         return false;
02913     }
02914   }
02915 
02916   // If the calling conventions do not match, then we'd better make sure the
02917   // results are returned in the same way as what the caller expects.
02918   if (!CCMatch) {
          // RVLocs1 holds the result locations under the callee's convention,
          // RVLocs2 the locations under the caller's; they must agree
          // element by element.
02919     SmallVector<CCValAssign, 16> RVLocs1;
02920     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
02921                     getTargetMachine(), RVLocs1, *DAG.getContext());
02922     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
02923 
02924     SmallVector<CCValAssign, 16> RVLocs2;
02925     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
02926                     getTargetMachine(), RVLocs2, *DAG.getContext());
02927     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
02928 
02929     if (RVLocs1.size() != RVLocs2.size())
02930       return false;
02931     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
02932       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
02933         return false;
02934       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
02935         return false;
02936       if (RVLocs1[i].isRegLoc()) {
02937         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
02938           return false;
02939       } else {
02940         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
02941           return false;
02942       }
02943     }
02944   }
02945 
02946   // If the callee takes no arguments then go on to check the results of the
02947   // call.
02948   if (!Outs.empty()) {
02949     // Check if stack adjustment is needed. For now, do not do this if any
02950     // argument is passed on the stack.
02951     SmallVector<CCValAssign, 16> ArgLocs;
02952     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
02953                    getTargetMachine(), ArgLocs, *DAG.getContext());
02954 
02955     // Allocate shadow area for Win64
02956     if (Subtarget->isTargetWin64()) {
02957       CCInfo.AllocateStack(32, 8);
02958     }
02959 
02960     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02961     if (CCInfo.getNextStackOffset()) {
            // NOTE(review): this non-const MF shadows the const MF declared at
            // the top of the function.
02962       MachineFunction &MF = DAG.getMachineFunction();
02963       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
02964         return false;
02965 
02966       // Check if the arguments are already laid out in the right way as
02967       // the caller's fixed stack objects.
02968       MachineFrameInfo *MFI = MF.getFrameInfo();
02969       const MachineRegisterInfo *MRI = &MF.getRegInfo();
02970       const X86InstrInfo *TII =
02971         ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
02972       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02973         CCValAssign &VA = ArgLocs[i];
              // NOTE(review): OutVals and Outs are indexed with the ArgLocs
              // index, which assumes the three vectors are parallel - confirm.
02974         SDValue Arg = OutVals[i];
02975         ISD::ArgFlagsTy Flags = Outs[i].Flags;
02976         if (VA.getLocInfo() == CCValAssign::Indirect)
02977           return false;
02978         if (!VA.isRegLoc()) {
02979           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
02980                                    MFI, MRI, TII))
02981             return false;
02982         }
02983       }
02984     }
02985 
02986     // If the tailcall address may be in a register, then make sure it's
02987     // possible to register allocate for it. In 32-bit, the call address can
02988     // only target EAX, EDX, or ECX since the tail call must be scheduled after
02989     // callee-saved registers are restored. These happen to be the same
02990     // registers used to pass 'inreg' arguments so watch out for those.
02991     if (!Subtarget->is64Bit() &&
02992         ((!isa<GlobalAddressSDNode>(Callee) &&
02993           !isa<ExternalSymbolSDNode>(Callee)) ||
02994          getTargetMachine().getRelocationModel() == Reloc::PIC_)) {
02995       unsigned NumInRegs = 0;
02996       // In PIC we need an extra register to formulate the address computation
02997       // for the callee.
02998       unsigned MaxInRegs =
02999           (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03000 
03001       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03002         CCValAssign &VA = ArgLocs[i];
03003         if (!VA.isRegLoc())
03004           continue;
03005         unsigned Reg = VA.getLocReg();
03006         switch (Reg) {
03007         default: break;
03008         case X86::EAX: case X86::EDX: case X86::ECX:
                // Give up as soon as the arguments occupy MaxInRegs of these
                // three registers.
03009           if (++NumInRegs == MaxInRegs)
03010             return false;
03011           break;
03012         }
03013       }
03014     }
03015   }
03016 
03017   return true;
03018 }
03019 
03020 FastISel *
03021 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03022                                   const TargetLibraryInfo *libInfo) const {
03023   return X86::createFastISel(funcInfo, libInfo);
03024 }
03025 
03026 //===----------------------------------------------------------------------===//
03027 //                           Other Lowering Hooks
03028 //===----------------------------------------------------------------------===//
03029 
03030 static bool MayFoldLoad(SDValue Op) {
03031   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03032 }
03033 
03034 static bool MayFoldIntoStore(SDValue Op) {
03035   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03036 }
03037 
03038 static bool isTargetShuffle(unsigned Opcode) {
03039   switch(Opcode) {
03040   default: return false;
03041   case X86ISD::PSHUFD:
03042   case X86ISD::PSHUFHW:
03043   case X86ISD::PSHUFLW:
03044   case X86ISD::SHUFP:
03045   case X86ISD::PALIGNR:
03046   case X86ISD::MOVLHPS:
03047   case X86ISD::MOVLHPD:
03048   case X86ISD::MOVHLPS:
03049   case X86ISD::MOVLPS:
03050   case X86ISD::MOVLPD:
03051   case X86ISD::MOVSHDUP:
03052   case X86ISD::MOVSLDUP:
03053   case X86ISD::MOVDDUP:
03054   case X86ISD::MOVSS:
03055   case X86ISD::MOVSD:
03056   case X86ISD::UNPCKL:
03057   case X86ISD::UNPCKH:
03058   case X86ISD::VPERMILP:
03059   case X86ISD::VPERM2X128:
03060   case X86ISD::VPERMI:
03061     return true;
03062   }
03063 }
03064 
03065 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03066                                     SDValue V1, SelectionDAG &DAG) {
03067   switch(Opc) {
03068   default: llvm_unreachable("Unknown x86 shuffle node");
03069   case X86ISD::MOVSHDUP:
03070   case X86ISD::MOVSLDUP:
03071   case X86ISD::MOVDDUP:
03072     return DAG.getNode(Opc, dl, VT, V1);
03073   }
03074 }
03075 
03076 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03077                                     SDValue V1, unsigned TargetMask,
03078                                     SelectionDAG &DAG) {
03079   switch(Opc) {
03080   default: llvm_unreachable("Unknown x86 shuffle node");
03081   case X86ISD::PSHUFD:
03082   case X86ISD::PSHUFHW:
03083   case X86ISD::PSHUFLW:
03084   case X86ISD::VPERMILP:
03085   case X86ISD::VPERMI:
03086     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03087   }
03088 }
03089 
03090 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03091                                     SDValue V1, SDValue V2, unsigned TargetMask,
03092                                     SelectionDAG &DAG) {
03093   switch(Opc) {
03094   default: llvm_unreachable("Unknown x86 shuffle node");
03095   case X86ISD::PALIGNR:
03096   case X86ISD::SHUFP:
03097   case X86ISD::VPERM2X128:
03098     return DAG.getNode(Opc, dl, VT, V1, V2,
03099                        DAG.getConstant(TargetMask, MVT::i8));
03100   }
03101 }
03102 
03103 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03104                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03105   switch(Opc) {
03106   default: llvm_unreachable("Unknown x86 shuffle node");
03107   case X86ISD::MOVLHPS:
03108   case X86ISD::MOVLHPD:
03109   case X86ISD::MOVHLPS:
03110   case X86ISD::MOVLPS:
03111   case X86ISD::MOVLPD:
03112   case X86ISD::MOVSS:
03113   case X86ISD::MOVSD:
03114   case X86ISD::UNPCKL:
03115   case X86ISD::UNPCKH:
03116     return DAG.getNode(Opc, dl, VT, V1, V2);
03117   }
03118 }
03119 
03120 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03121   MachineFunction &MF = DAG.getMachineFunction();
03122   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03123   int ReturnAddrIndex = FuncInfo->getRAIndex();
03124 
03125   if (ReturnAddrIndex == 0) {
03126     // Set up a frame object for the return address.
03127     unsigned SlotSize = RegInfo->getSlotSize();
03128     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
03129                                                            false);
03130     FuncInfo->setRAIndex(ReturnAddrIndex);
03131   }
03132 
03133   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03134 }
03135 
03136 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03137                                        bool hasSymbolicDisplacement) {
03138   // Offset should fit into 32 bit immediate field.
03139   if (!isInt<32>(Offset))
03140     return false;
03141 
03142   // If we don't have a symbolic displacement - we don't have any extra
03143   // restrictions.
03144   if (!hasSymbolicDisplacement)
03145     return true;
03146 
03147   // FIXME: Some tweaks might be needed for medium code model.
03148   if (M != CodeModel::Small && M != CodeModel::Kernel)
03149     return false;
03150 
03151   // For small code model we assume that latest object is 16MB before end of 31
03152   // bits boundary. We may also accept pretty large negative constants knowing
03153   // that all objects are in the positive half of address space.
03154   if (M == CodeModel::Small && Offset < 16*1024*1024)
03155     return true;
03156 
03157   // For kernel code model we know that all object resist in the negative half
03158   // of 32bits address space. We may not accept negative offsets, since they may
03159   // be just off and we may accept pretty large positive ones.
03160   if (M == CodeModel::Kernel && Offset > 0)
03161     return true;
03162 
03163   return false;
03164 }
03165 
03166 /// isCalleePop - Determines whether the callee is required to pop its
03167 /// own arguments. Callee pop is necessary to support tail calls.
03168 bool X86::isCalleePop(CallingConv::ID CallingConv,
03169                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03170   if (IsVarArg)
03171     return false;
03172 
03173   switch (CallingConv) {
03174   default:
03175     return false;
03176   case CallingConv::X86_StdCall:
03177     return !is64Bit;
03178   case CallingConv::X86_FastCall:
03179     return !is64Bit;
03180   case CallingConv::X86_ThisCall:
03181     return !is64Bit;
03182   case CallingConv::Fast:
03183     return TailCallOpt;
03184   case CallingConv::GHC:
03185     return TailCallOpt;
03186   case CallingConv::HiPE:
03187     return TailCallOpt;
03188   }
03189 }
03190 
03191 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
03192 /// specific condition code, returning the condition code and the LHS/RHS of the
03193 /// comparison to make.
03194 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03195                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03196   if (!isFP) {
03197     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03198       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03199         // X > -1   -> X == 0, jump !sign.
03200         RHS = DAG.getConstant(0, RHS.getValueType());
03201         return X86::COND_NS;
03202       }
03203       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03204         // X < 0   -> X == 0, jump on sign.
03205         return X86::COND_S;
03206       }
03207       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03208         // X < 1   -> X <= 0
03209         RHS = DAG.getConstant(0, RHS.getValueType());
03210         return X86::COND_LE;
03211       }
03212     }
03213 
03214     switch (SetCCOpcode) {
03215     default: llvm_unreachable("Invalid integer condition!");
03216     case ISD::SETEQ:  return X86::COND_E;
03217     case ISD::SETGT:  return X86::COND_G;
03218     case ISD::SETGE:  return X86::COND_GE;
03219     case ISD::SETLT:  return X86::COND_L;
03220     case ISD::SETLE:  return X86::COND_LE;
03221     case ISD::SETNE:  return X86::COND_NE;
03222     case ISD::SETULT: return X86::COND_B;
03223     case ISD::SETUGT: return X86::COND_A;
03224     case ISD::SETULE: return X86::COND_BE;
03225     case ISD::SETUGE: return X86::COND_AE;
03226     }
03227   }
03228 
03229   // First determine if it is required or is profitable to flip the operands.
03230 
03231   // If LHS is a foldable load, but RHS is not, flip the condition.
03232   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03233       !ISD::isNON_EXTLoad(RHS.getNode())) {
03234     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03235     std::swap(LHS, RHS);
03236   }
03237 
03238   switch (SetCCOpcode) {
03239   default: break;
03240   case ISD::SETOLT:
03241   case ISD::SETOLE:
03242   case ISD::SETUGT:
03243   case ISD::SETUGE:
03244     std::swap(LHS, RHS);
03245     break;
03246   }
03247 
03248   // On a floating point condition, the flags are set as follows:
03249   // ZF  PF  CF   op
03250   //  0 | 0 | 0 | X > Y
03251   //  0 | 0 | 1 | X < Y
03252   //  1 | 0 | 0 | X == Y
03253   //  1 | 1 | 1 | unordered
03254   switch (SetCCOpcode) {
03255   default: llvm_unreachable("Condcode should be pre-legalized away");
03256   case ISD::SETUEQ:
03257   case ISD::SETEQ:   return X86::COND_E;
03258   case ISD::SETOLT:              // flipped
03259   case ISD::SETOGT:
03260   case ISD::SETGT:   return X86::COND_A;
03261   case ISD::SETOLE:              // flipped
03262   case ISD::SETOGE:
03263   case ISD::SETGE:   return X86::COND_AE;
03264   case ISD::SETUGT:              // flipped
03265   case ISD::SETULT:
03266   case ISD::SETLT:   return X86::COND_B;
03267   case ISD::SETUGE:              // flipped
03268   case ISD::SETULE:
03269   case ISD::SETLE:   return X86::COND_BE;
03270   case ISD::SETONE:
03271   case ISD::SETNE:   return X86::COND_NE;
03272   case ISD::SETUO:   return X86::COND_P;
03273   case ISD::SETO:    return X86::COND_NP;
03274   case ISD::SETOEQ:
03275   case ISD::SETUNE:  return X86::COND_INVALID;
03276   }
03277 }
03278 
03279 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
03280 /// code. Current x86 isa includes the following FP cmov instructions:
03281 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03282 static bool hasFPCMov(unsigned X86CC) {
03283   switch (X86CC) {
03284   default:
03285     return false;
03286   case X86::COND_B:
03287   case X86::COND_BE:
03288   case X86::COND_E:
03289   case X86::COND_P:
03290   case X86::COND_A:
03291   case X86::COND_AE:
03292   case X86::COND_NE:
03293   case X86::COND_NP:
03294     return true;
03295   }
03296 }
03297 
03298 /// isFPImmLegal - Returns true if the target can instruction select the
03299 /// specified FP immediate natively. If false, the legalizer will
03300 /// materialize the FP immediate as a load from a constant pool.
03301 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03302   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03303     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03304       return true;
03305   }
03306   return false;
03307 }
03308 
/// isUndefOrInRange - Return true if Val is undef (negative) or if its value
/// falls within the half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  if (Val < 0)
    return true;
  return Low <= Val && Val < Hi;
}
03314 
/// isUndefOrEqual - Return true if Val is undef (less than zero) or equal to
/// CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0)
    return true;
  return Val == CmpVal;
}
03320 
03321 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03322 /// from position Pos and ending in Pos+Size, falls within the specified
03323 /// sequential range (L, L+Pos]. or is undef.
03324 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03325                                        unsigned Pos, unsigned Size, int Low) {
03326   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03327     if (!isUndefOrEqual(Mask[i], Low))
03328       return false;
03329   return true;
03330 }
03331 
03332 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
03333 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
03334 /// the second operand.
03335 static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
03336   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
03337     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
03338   if (VT == MVT::v2f64 || VT == MVT::v2i64)
03339     return (Mask[0] < 2 && Mask[1] < 2);
03340   return false;
03341 }
03342 
03343 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
03344 /// is suitable for input to PSHUFHW.
03345 static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
03346   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03347     return false;
03348 
03349   // Lower quadword copied in order or undef.
03350   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
03351     return false;
03352 
03353   // Upper quadword shuffled.
03354   for (unsigned i = 4; i != 8; ++i)
03355     if (!isUndefOrInRange(Mask[i], 4, 8))
03356       return false;
03357 
03358   if (VT == MVT::v16i16) {
03359     // Lower quadword copied in order or undef.
03360     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
03361       return false;
03362 
03363     // Upper quadword shuffled.
03364     for (unsigned i = 12; i != 16; ++i)
03365       if (!isUndefOrInRange(Mask[i], 12, 16))
03366         return false;
03367   }
03368 
03369   return true;
03370 }
03371 
03372 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
03373 /// is suitable for input to PSHUFLW.
03374 static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
03375   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03376     return false;
03377 
03378   // Upper quadword copied in order.
03379   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
03380     return false;
03381 
03382   // Lower quadword shuffled.
03383   for (unsigned i = 0; i != 4; ++i)
03384     if (!isUndefOrInRange(Mask[i], 0, 4))
03385       return false;
03386 
03387   if (VT == MVT::v16i16) {
03388     // Upper quadword copied in order.
03389     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
03390       return false;
03391 
03392     // Lower quadword shuffled.
03393     for (unsigned i = 8; i != 12; ++i)
03394       if (!isUndefOrInRange(Mask[i], 8, 12))
03395         return false;
03396   }
03397 
03398   return true;
03399 }
03400 
03401 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
03402 /// is suitable for input to PALIGNR.
03403 static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
03404                           const X86Subtarget *Subtarget) {
03405   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
03406       (VT.is256BitVector() && !Subtarget->hasInt256()))
03407     return false;
03408 
03409   unsigned NumElts = VT.getVectorNumElements();
03410   unsigned NumLanes = VT.getSizeInBits()/128;
03411   unsigned NumLaneElts = NumElts/NumLanes;
03412 
03413   // Do not handle 64-bit element shuffles with palignr.
03414   if (NumLaneElts == 2)
03415     return false;
03416 
03417   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
03418     unsigned i;
03419     for (i = 0; i != NumLaneElts; ++i) {
03420       if (Mask[i+l] >= 0)
03421         break;
03422     }
03423 
03424     // Lane is all undef, go to next lane
03425     if (i == NumLaneElts)
03426       continue;
03427 
03428     int Start = Mask[i+l];
03429 
03430     // Make sure its in this lane in one of the sources
03431     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
03432         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
03433       return false;
03434 
03435     // If not lane 0, then we must match lane 0
03436     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
03437       return false;
03438 
03439     // Correct second source to be contiguous with first source
03440     if (Start >= (int)NumElts)
03441       Start -= NumElts - NumLaneElts;
03442 
03443     // Make sure we're shifting in the right direction.
03444     if (Start <= (int)(i+l))
03445       return false;
03446 
03447     Start -= i;
03448 
03449     // Check the rest of the elements to see if they are consecutive.
03450     for (++i; i != NumLaneElts; ++i) {
03451       int Idx = Mask[i+l];
03452 
03453       // Make sure its in this lane
03454       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
03455           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
03456         return false;
03457 
03458       // If not lane 0, then we must match lane 0
03459       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
03460         return false;
03461 
03462       if (Idx >= (int)NumElts)
03463         Idx -= NumElts - NumLaneElts;
03464 
03465       if (!isUndefOrEqual(Idx, Start+i))
03466         return false;
03467 
03468     }
03469   }
03470 
03471   return true;
03472 }
03473 
03474 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
03475 /// the two vector operands have swapped position.
03476 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
03477                                      unsigned NumElems) {
03478   for (unsigned i = 0; i != NumElems; ++i) {
03479     int idx = Mask[i];
03480     if (idx < 0)
03481       continue;
03482     else if (idx < (int)NumElems)
03483       Mask[i] = idx + NumElems;
03484     else
03485       Mask[i] = idx - NumElems;
03486   }
03487 }
03488 
03489 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
03490 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
03491 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
03492 /// reverse of what x86 shuffles want.
03493 static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
03494                         bool Commuted = false) {
03495   if (!HasFp256 && VT.is256BitVector())
03496     return false;
03497 
03498   unsigned NumElems = VT.getVectorNumElements();
03499   unsigned NumLanes = VT.getSizeInBits()/128;
03500   unsigned NumLaneElems = NumElems/NumLanes;
03501 
03502   if (NumLaneElems != 2 && NumLaneElems != 4)
03503     return false;
03504 
03505   // VSHUFPSY divides the resulting vector into 4 chunks.
03506   // The sources are also splitted into 4 chunks, and each destination
03507   // chunk must come from a different source chunk.
03508   //
03509   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
03510   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
03511   //
03512   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
03513   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
03514   //
03515   // VSHUFPDY divides the resulting vector into 4 chunks.
03516   // The sources are also splitted into 4 chunks, and each destination
03517   // chunk must come from a different source chunk.
03518   //
03519   //  SRC1 =>      X3       X2       X1       X0
03520   //  SRC2 =>      Y3       Y2       Y1       Y0
03521   //
03522   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
03523   //
03524   unsigned HalfLaneElems = NumLaneElems/2;
03525   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
03526     for (unsigned i = 0; i != NumLaneElems; ++i) {
03527       int Idx = Mask[i+l];
03528       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
03529       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
03530         return false;
03531       // For VSHUFPSY, the mask of the second half must be the same as the
03532       // first but with the appropriate offsets. This works in the same way as
03533       // VPERMILPS works with masks.
03534       if (NumElems != 8 || l == 0 || Mask[i] < 0)
03535         continue;
03536       if (!isUndefOrEqual(Idx, Mask[i]+l))
03537         return false;
03538     }
03539   }
03540 
03541   return true;
03542 }
03543 
03544 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
03545 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
03546 static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
03547   if (!VT.is128BitVector())
03548     return false;
03549 
03550   unsigned NumElems = VT.getVectorNumElements();
03551 
03552   if (NumElems != 4)
03553     return false;
03554 
03555   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
03556   return isUndefOrEqual(Mask[0], 6) &&
03557          isUndefOrEqual(Mask[1], 7) &&
03558          isUndefOrEqual(Mask[2], 2) &&
03559          isUndefOrEqual(Mask[3], 3);
03560 }
03561 
03562 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
03563 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
03564 /// <2, 3, 2, 3>
03565 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
03566   if (!VT.is128BitVector())
03567     return false;
03568 
03569   unsigned NumElems = VT.getVectorNumElements();
03570 
03571   if (NumElems != 4)
03572     return false;
03573 
03574   return isUndefOrEqual(Mask[0], 2) &&
03575          isUndefOrEqual(Mask[1], 3) &&
03576          isUndefOrEqual(Mask[2], 2) &&
03577          isUndefOrEqual(Mask[3], 3);
03578 }
03579 
03580 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
03581 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
03582 static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
03583   if (!VT.is128BitVector())
03584     return false;
03585 
03586   unsigned NumElems = VT.getVectorNumElements();
03587 
03588   if (NumElems != 2 && NumElems != 4)
03589     return false;
03590 
03591   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
03592     if (!isUndefOrEqual(Mask[i], i + NumElems))
03593       return false;
03594 
03595   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
03596     if (!isUndefOrEqual(Mask[i], i))
03597       return false;
03598 
03599   return true;
03600 }
03601 
03602 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
03603 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
03604 static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
03605   if (!VT.is128BitVector())
03606     return false;
03607 
03608   unsigned NumElems = VT.getVectorNumElements();
03609 
03610   if (NumElems != 2 && NumElems != 4)
03611     return false;
03612 
03613   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
03614     if (!isUndefOrEqual(Mask[i], i))
03615       return false;
03616 
03617   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
03618     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
03619       return false;
03620 
03621   return true;
03622 }
03623 
03624 //
03625 // Some special combinations that can be optimized.
03626 //
03627 static
03628 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
03629                                SelectionDAG &DAG) {
03630   MVT VT = SVOp->getValueType(0).getSimpleVT();
03631   SDLoc dl(SVOp);
03632 
03633   if (VT != MVT::v8i32 && VT != MVT::v8f32)
03634     return SDValue();
03635 
03636   ArrayRef<int> Mask = SVOp->getMask();
03637 
03638   // These are the special masks that may be optimized.
03639   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
03640   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
03641   bool MatchEvenMask = true;
03642   bool MatchOddMask  = true;
03643   for (int i=0; i<8; ++i) {
03644     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
03645       MatchEvenMask = false;
03646     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
03647       MatchOddMask = false;
03648   }
03649 
03650   if (!MatchEvenMask && !MatchOddMask)
03651     return SDValue();
03652 
03653   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
03654 
03655   SDValue Op0 = SVOp->getOperand(0);
03656   SDValue Op1 = SVOp->getOperand(1);
03657 
03658   if (MatchEvenMask) {
03659     // Shift the second operand right to 32 bits.
03660     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
03661     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
03662   } else {
03663     // Shift the first operand left to 32 bits.
03664     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
03665     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
03666   }
03667   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
03668   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
03669 }
03670 
03671 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
03672 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
03673 static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
03674                          bool HasInt256, bool V2IsSplat = false) {
03675   unsigned NumElts = VT.getVectorNumElements();
03676 
03677   assert((VT.is128BitVector() || VT.is256BitVector()) &&
03678          "Unsupported vector type for unpckh");
03679 
03680   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
03681       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
03682     return false;
03683 
03684   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
03685   // independently on 128-bit lanes.
03686   unsigned NumLanes = VT.getSizeInBits()/128;
03687   unsigned NumLaneElts = NumElts/NumLanes;
03688 
03689   for (unsigned l = 0; l != NumLanes; ++l) {
03690     for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
03691          i != (l+1)*NumLaneElts;
03692          i += 2, ++j) {
03693       int BitI  = Mask[i];
03694       int BitI1 = Mask[i+1];
03695       if (!isUndefOrEqual(BitI, j))
03696         return false;
03697       if (V2IsSplat) {
03698         if (!isUndefOrEqual(BitI1, NumElts))
03699           return false;
03700       } else {
03701         if (!isUndefOrEqual(BitI1, j + NumElts))
03702           return false;
03703       }
03704     }
03705   }
03706 
03707   return true;
03708 }
03709 
03710 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
03711 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
03712 static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
03713                          bool HasInt256, bool V2IsSplat = false) {
03714   unsigned NumElts = VT.getVectorNumElements();
03715 
03716   assert((VT.is128BitVector() || VT.is256BitVector()) &&
03717          "Unsupported vector type for unpckh");
03718 
03719   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
03720       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
03721     return false;
03722 
03723   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
03724   // independently on 128-bit lanes.
03725   unsigned NumLanes = VT.getSizeInBits()/128;
03726   unsigned NumLaneElts = NumElts/NumLanes;
03727 
03728   for (unsigned l = 0; l != NumLanes; ++l) {
03729     for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
03730          i != (l+1)*NumLaneElts; i += 2, ++j) {
03731       int BitI  = Mask[i];
03732       int BitI1 = Mask[i+1];
03733       if (!isUndefOrEqual(BitI, j))
03734         return false;
03735       if (V2IsSplat) {
03736         if (isUndefOrEqual(BitI1, NumElts))
03737           return false;
03738       } else {
03739         if (!isUndefOrEqual(BitI1, j+NumElts))
03740           return false;
03741       }
03742     }
03743   }
03744   return true;
03745 }
03746 
03747 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
03748 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
03749 /// <0, 0, 1, 1>
03750 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
03751   unsigned NumElts = VT.getVectorNumElements();
03752   bool Is256BitVec = VT.is256BitVector();
03753 
03754   assert((VT.is128BitVector() || VT.is256BitVector()) &&
03755          "Unsupported vector type for unpckh");
03756 
03757   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
03758       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
03759     return false;
03760 
03761   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
03762   // FIXME: Need a better way to get rid of this, there's no latency difference
03763   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
03764   // the former later. We should also remove the "_undef" special mask.
03765   if (NumElts == 4 && Is256BitVec)
03766     return false;
03767 
03768   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
03769   // independently on 128-bit lanes.
03770   unsigned NumLanes = VT.getSizeInBits()/128;
03771   unsigned NumLaneElts = NumElts/NumLanes;
03772 
03773   for (unsigned l = 0; l != NumLanes; ++l) {
03774     for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
03775          i != (l+1)*NumLaneElts;
03776          i += 2, ++j) {
03777       int BitI  = Mask[i];
03778       int BitI1 = Mask[i+1];
03779 
03780       if (!isUndefOrEqual(BitI, j))
03781         return false;
03782       if (!isUndefOrEqual(BitI1, j))
03783         return false;
03784     }
03785   }
03786 
03787   return true;
03788 }
03789 
03790 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
03791 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
03792 /// <2, 2, 3, 3>
03793 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
03794   unsigned NumElts = VT.getVectorNumElements();
03795 
03796   assert((VT.is128BitVector() || VT.is256BitVector()) &&
03797          "Unsupported vector type for unpckh");
03798 
03799   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
03800       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
03801     return false;
03802 
03803   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
03804   // independently on 128-bit lanes.
03805   unsigned NumLanes = VT.getSizeInBits()/128;
03806   unsigned NumLaneElts = NumElts/NumLanes;
03807 
03808   for (unsigned l = 0; l != NumLanes; ++l) {
03809     for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
03810          i != (l+1)*NumLaneElts; i += 2, ++j) {
03811       int BitI  = Mask[i];
03812       int BitI1 = Mask[i+1];
03813       if (!isUndefOrEqual(BitI, j))
03814         return false;
03815       if (!isUndefOrEqual(BitI1, j))
03816         return false;
03817     }
03818   }
03819   return true;
03820 }
03821 
03822 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
03823 /// specifies a shuffle of elements that is suitable for input to MOVSS,
03824 /// MOVSD, and MOVD, i.e. setting the lowest element.
03825 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
03826   if (VT.getVectorElementType().getSizeInBits() < 32)
03827     return false;
03828   if (!VT.is128BitVector())
03829     return false;
03830 
03831   unsigned NumElts = VT.getVectorNumElements();
03832 
03833   if (!isUndefOrEqual(Mask[0], NumElts))
03834     return false;
03835 
03836   for (unsigned i = 1; i != NumElts; ++i)
03837     if (!isUndefOrEqual(Mask[i], i))
03838       return false;
03839 
03840   return true;
03841 }
03842 
03843 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
03844 /// as permutations between 128-bit chunks or halves. As an example: this
03845 /// shuffle bellow:
03846 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
03847 /// The first half comes from the second half of V1 and the second half from the
03848 /// the second half of V2.
03849 static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
03850   if (!HasFp256 || !VT.is256BitVector())
03851     return false;
03852 
03853   // The shuffle result is divided into half A and half B. In total the two
03854   // sources have 4 halves, namely: C, D, E, F. The final values of A and
03855   // B must come from C, D, E or F.
03856   unsigned HalfSize = VT.getVectorNumElements()/2;
03857   bool MatchA = false, MatchB = false;
03858 
03859   // Check if A comes from one of C, D, E, F.
03860   for (unsigned Half = 0; Half != 4; ++Half) {
03861     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
03862       MatchA = true;
03863       break;
03864     }
03865   }
03866 
03867   // Check if B comes from one of C, D, E, F.
03868   for (unsigned Half = 0; Half != 4; ++Half) {
03869     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
03870       MatchB = true;
03871       break;
03872     }
03873   }
03874 
03875   return MatchA && MatchB;
03876 }
03877 
03878 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
03879 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
03880 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
03881   MVT VT = SVOp->getValueType(0).getSimpleVT();
03882 
03883   unsigned HalfSize = VT.getVectorNumElements()/2;
03884 
03885   unsigned FstHalf = 0, SndHalf = 0;
03886   for (unsigned i = 0; i < HalfSize; ++i) {
03887     if (SVOp->getMaskElt(i) > 0) {
03888       FstHalf = SVOp->getMaskElt(i)/HalfSize;
03889       break;
03890     }
03891   }
03892   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
03893     if (SVOp->getMaskElt(i) > 0) {
03894       SndHalf = SVOp->getMaskElt(i)/HalfSize;
03895       break;
03896     }
03897   }
03898 
03899   return (FstHalf | (SndHalf << 4));
03900 }
03901 
03902 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
03903 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
03904 /// Note that VPERMIL mask matching is different depending whether theunderlying
03905 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
03906 /// to the same elements of the low, but to the higher half of the source.
03907 /// In VPERMILPD the two lanes could be shuffled independently of each other
03908 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
03909 static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
03910   if (!HasFp256)
03911     return false;
03912 
03913   unsigned NumElts = VT.getVectorNumElements();
03914   // Only match 256-bit with 32/64-bit types
03915   if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
03916     return false;
03917 
03918   unsigned NumLanes = VT.getSizeInBits()/128;
03919   unsigned LaneSize = NumElts/NumLanes;
03920   for (unsigned l = 0; l != NumElts; l += LaneSize) {
03921     for (unsigned i = 0; i != LaneSize; ++i) {
03922       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
03923         return false;
03924       if (NumElts != 8 || l == 0)
03925         continue;
03926       // VPERMILPS handling
03927       if (Mask[i] < 0)
03928         continue;
03929       if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
03930         return false;
03931     }
03932   }
03933 
03934   return true;
03935 }
03936 
03937 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
03938 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
03939 /// element of vector 2 and the other elements to come from vector 1 in order.
03940 static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
03941                                bool V2IsSplat = false, bool V2IsUndef = false) {
03942   if (!VT.is128BitVector())
03943     return false;
03944 
03945   unsigned NumOps = VT.getVectorNumElements();
03946   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
03947     return false;
03948 
03949   if (!isUndefOrEqual(Mask[0], 0))
03950     return false;
03951 
03952   for (unsigned i = 1; i != NumOps; ++i)
03953     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
03954           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
03955           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
03956       return false;
03957 
03958   return true;
03959 }
03960 
03961 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
03962 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
03963 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
03964 static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
03965                            const X86Subtarget *Subtarget) {
03966   if (!Subtarget->hasSSE3())
03967     return false;
03968 
03969   unsigned NumElems = VT.getVectorNumElements();
03970 
03971   if ((VT.is128BitVector() && NumElems != 4) ||
03972       (VT.is256BitVector() && NumElems != 8))
03973     return false;
03974 
03975   // "i+1" is the value the indexed mask element must have
03976   for (unsigned i = 0; i != NumElems; i += 2)
03977     if (!isUndefOrEqual(Mask[i], i+1) ||
03978         !isUndefOrEqual(Mask[i+1], i+1))
03979       return false;
03980 
03981   return true;
03982 }
03983 
03984 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
03985 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
03986 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
03987 static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
03988                            const X86Subtarget *Subtarget) {
03989   if (!Subtarget->hasSSE3())
03990     return false;
03991 
03992   unsigned NumElems = VT.getVectorNumElements();
03993 
03994   if ((VT.is128BitVector() && NumElems != 4) ||
03995       (VT.is256BitVector() && NumElems != 8))
03996     return false;
03997 
03998   // "i" is the value the indexed mask element must have
03999   for (unsigned i = 0; i != NumElems; i += 2)
04000     if (!isUndefOrEqual(Mask[i], i) ||
04001         !isUndefOrEqual(Mask[i+1], i))
04002       return false;
04003 
04004   return true;
04005 }
04006 
04007 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
04008 /// specifies a shuffle of elements that is suitable for input to 256-bit
04009 /// version of MOVDDUP.
04010 static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
04011   if (!HasFp256 || !VT.is256BitVector())
04012     return false;
04013 
04014   unsigned NumElts = VT.getVectorNumElements();
04015   if (NumElts != 4)
04016     return false;
04017 
04018   for (unsigned i = 0; i != NumElts/2; ++i)
04019     if (!isUndefOrEqual(Mask[i], 0))
04020       return false;
04021   for (unsigned i = NumElts/2; i != NumElts; ++i)
04022     if (!isUndefOrEqual(Mask[i], NumElts/2))
04023       return false;
04024   return true;
04025 }
04026 
04027 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04028 /// specifies a shuffle of elements that is suitable for input to 128-bit
04029 /// version of MOVDDUP.
04030 static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
04031   if (!VT.is128BitVector())
04032     return false;
04033 
04034   unsigned e = VT.getVectorNumElements() / 2;
04035   for (unsigned i = 0; i != e; ++i)
04036     if (!isUndefOrEqual(Mask[i], i))
04037       return false;
04038   for (unsigned i = 0; i != e; ++i)
04039     if (!isUndefOrEqual(Mask[e+i], i))
04040       return false;
04041   return true;
04042 }
04043 
04044 /// isVEXTRACTF128Index - Return true if the specified
04045 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
04046 /// suitable for input to VEXTRACTF128.
04047 bool X86::isVEXTRACTF128Index(SDNode *N) {
04048   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04049     return false;
04050 
04051   // The index should be aligned on a 128-bit boundary.
04052   uint64_t Index =
04053     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04054 
04055   MVT VT = N->getValueType(0).getSimpleVT();
04056   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04057   bool Result = (Index * ElSize) % 128 == 0;
04058 
04059   return Result;
04060 }
04061 
04062 /// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
04063 /// operand specifies a subvector insert that is suitable for input to
04064 /// VINSERTF128.
04065 bool X86::isVINSERTF128Index(SDNode *N) {
04066   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04067     return false;
04068 
04069   // The index should be aligned on a 128-bit boundary.
04070   uint64_t Index =
04071     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04072 
04073   MVT VT = N->getValueType(0).getSimpleVT();
04074   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04075   bool Result = (Index * ElSize) % 128 == 0;
04076 
04077   return Result;
04078 }
04079 
04080 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
04081 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
04082 /// Handles 128-bit and 256-bit.
04083 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
04084   MVT VT = N->getValueType(0).getSimpleVT();
04085 
04086   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04087          "Unsupported vector type for PSHUF/SHUFP");
04088 
04089   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
04090   // independently on 128-bit lanes.
04091   unsigned NumElts = VT.getVectorNumElements();
04092   unsigned NumLanes = VT.getSizeInBits()/128;
04093   unsigned NumLaneElts = NumElts/NumLanes;
04094 
04095   assert((NumLaneElts == 2 || NumLaneElts == 4) &&
04096          "Only supports 2 or 4 elements per lane");
04097 
04098   unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
04099   unsigned Mask = 0;
04100   for (unsigned i = 0; i != NumElts; ++i) {
04101     int Elt = N->getMaskElt(i);
04102     if (Elt < 0) continue;
04103     Elt &= NumLaneElts - 1;
04104     unsigned ShAmt = (i << Shift) % 8;
04105     Mask |= Elt << ShAmt;
04106   }
04107 
04108   return Mask;
04109 }
04110 
04111 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
04112 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
04113 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
04114   MVT VT = N->getValueType(0).getSimpleVT();
04115 
04116   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04117          "Unsupported vector type for PSHUFHW");
04118 
04119   unsigned NumElts = VT.getVectorNumElements();
04120 
04121   unsigned Mask = 0;
04122   for (unsigned l = 0; l != NumElts; l += 8) {
04123     // 8 nodes per lane, but we only care about the last 4.
04124     for (unsigned i = 0; i < 4; ++i) {
04125       int Elt = N->getMaskElt(l+i+4);
04126       if (Elt < 0) continue;
04127       Elt &= 0x3; // only 2-bits.
04128       Mask |= Elt << (i * 2);
04129     }
04130   }
04131 
04132   return Mask;
04133 }
04134 
04135 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
04136 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
04137 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
04138   MVT VT = N->getValueType(0).getSimpleVT();
04139 
04140   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04141          "Unsupported vector type for PSHUFHW");
04142 
04143   unsigned NumElts = VT.getVectorNumElements();
04144 
04145   unsigned Mask = 0;
04146   for (unsigned l = 0; l != NumElts; l += 8) {
04147     // 8 nodes per lane, but we only care about the first 4.
04148     for (unsigned i = 0; i < 4; ++i) {
04149       int Elt = N->getMaskElt(l+i);
04150       if (Elt < 0) continue;
04151       Elt &= 0x3; // only 2-bits
04152       Mask |= Elt << (i * 2);
04153     }
04154   }
04155 
04156   return Mask;
04157 }
04158 
04159 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
04160 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
04161 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
04162   MVT VT = SVOp->getValueType(0).getSimpleVT();
04163   unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
04164 
04165   unsigned NumElts = VT.getVectorNumElements();
04166   unsigned NumLanes = VT.getSizeInBits()/128;
04167   unsigned NumLaneElts = NumElts/NumLanes;
04168 
04169   int Val = 0;
04170   unsigned i;
04171   for (i = 0; i != NumElts; ++i) {
04172     Val = SVOp->getMaskElt(i);
04173     if (Val >= 0)
04174       break;
04175   }
04176   if (Val >= (int)NumElts)
04177     Val -= NumElts - NumLaneElts;
04178 
04179   assert(Val - i > 0 && "PALIGNR imm should be positive");
04180   return (Val - i) * EltSize;
04181 }
04182 
04183 /// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
04184 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
04185 /// instructions.
04186 unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
04187   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04188     llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
04189 
04190   uint64_t Index =
04191     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04192 
04193   MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
04194   MVT ElVT = VecVT.getVectorElementType();
04195 
04196   unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
04197   return Index / NumElemsPerChunk;
04198 }
04199 
04200 /// getInsertVINSERTF128Immediate - Return the appropriate immediate
04201 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
04202 /// instructions.
04203 unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
04204   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04205     llvm_unreachable("Illegal insert subvector for VINSERTF128");
04206 
04207   uint64_t Index =
04208     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04209 
04210   MVT VecVT = N->getValueType(0).getSimpleVT();
04211   MVT ElVT = VecVT.getVectorElementType();
04212 
04213   unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
04214   return Index / NumElemsPerChunk;
04215 }
04216 
04217 /// getShuffleCLImmediate - Return the appropriate immediate to shuffle
04218 /// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
04219 /// Handles 256-bit.
04220 static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
04221   MVT VT = N->getValueType(0).getSimpleVT();
04222 
04223   unsigned NumElts = VT.getVectorNumElements();
04224 
04225   assert((VT.is256BitVector() && NumElts == 4) &&
04226          "Unsupported vector type for VPERMQ/VPERMPD");
04227 
04228   unsigned Mask = 0;
04229   for (unsigned i = 0; i != NumElts; ++i) {
04230     int Elt = N->getMaskElt(i);
04231     if (Elt < 0)
04232       continue;
04233     Mask |= Elt << (i*2);
04234   }
04235 
04236   return Mask;
04237 }
04238 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
04239 /// constant +0.0.
04240 bool X86::isZeroNode(SDValue Elt) {
04241   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
04242     return CN->isNullValue();
04243   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
04244     return CFP->getValueAPF().isPosZero();
04245   return false;
04246 }
04247 
04248 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
04249 /// their permute mask.
04250 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
04251                                     SelectionDAG &DAG) {
04252   MVT VT = SVOp->getValueType(0).getSimpleVT();
04253   unsigned NumElems = VT.getVectorNumElements();
04254   SmallVector<int, 8> MaskVec;
04255 
04256   for (unsigned i = 0; i != NumElems; ++i) {
04257     int Idx = SVOp->getMaskElt(i);
04258     if (Idx >= 0) {
04259       if (Idx < (int)NumElems)
04260         Idx += NumElems;
04261       else
04262         Idx -= NumElems;
04263     }
04264     MaskVec.push_back(Idx);
04265   }
04266   return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
04267                               SVOp->getOperand(0), &MaskVec[0]);
04268 }
04269 
04270 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
04271 /// match movhlps. The lower half elements should come from upper half of
04272 /// V1 (and in order), and the upper half elements should come from the upper
04273 /// half of V2 (and in order).
04274 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
04275   if (!VT.is128BitVector())
04276     return false;
04277   if (VT.getVectorNumElements() != 4)
04278     return false;
04279   for (unsigned i = 0, e = 2; i != e; ++i)
04280     if (!isUndefOrEqual(Mask[i], i+2))
04281       return false;
04282   for (unsigned i = 2; i != 4; ++i)
04283     if (!isUndefOrEqual(Mask[i], i+4))
04284       return false;
04285   return true;
04286 }
04287 
04288 /// isScalarLoadToVector - Returns true if the node is a scalar load that
04289 /// is promoted to a vector. It also returns the LoadSDNode by reference if
04290 /// required.
04291 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
04292   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
04293     return false;
04294   N = N->getOperand(0).getNode();
04295   if (!ISD::isNON_EXTLoad(N))
04296     return false;
04297   if (LD)
04298     *LD = cast<LoadSDNode>(N);
04299   return true;
04300 }
04301 
04302 // Test whether the given value is a vector value which will be legalized
04303 // into a load.
04304 static bool WillBeConstantPoolLoad(SDNode *N) {
04305   if (N->getOpcode() != ISD::BUILD_VECTOR)
04306     return false;
04307 
04308   // Check for any non-constant elements.
04309   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
04310     switch (N->getOperand(i).getNode()->getOpcode()) {
04311     case ISD::UNDEF:
04312     case ISD::ConstantFP:
04313     case ISD::Constant:
04314       break;
04315     default:
04316       return false;
04317     }
04318 
04319   // Vectors of all-zeros and all-ones are materialized with special
04320   // instructions rather than being loaded.
04321   return !ISD::isBuildVectorAllZeros(N) &&
04322          !ISD::isBuildVectorAllOnes(N);
04323 }
04324 
04325 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
04326 /// match movlp{s|d}. The lower half elements should come from lower half of
04327 /// V1 (and in order), and the upper half elements should come from the upper
04328 /// half of V2 (and in order). And since V1 will become the source of the
04329 /// MOVLP, it must be either a vector load or a scalar load to vector.
04330 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
04331                                ArrayRef<int> Mask, EVT VT) {
04332   if (!VT.is128BitVector())
04333     return false;
04334 
04335   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
04336     return false;
04337   // Is V2 is a vector load, don't do this transformation. We will try to use
04338   // load folding shufps op.
04339   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
04340     return false;
04341 
04342   unsigned NumElems = VT.getVectorNumElements();
04343 
04344   if (NumElems != 2 && NumElems != 4)
04345     return false;
04346   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04347     if (!isUndefOrEqual(Mask[i], i))
04348       return false;
04349   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
04350     if (!isUndefOrEqual(Mask[i], i+NumElems))
04351       return false;
04352   return true;
04353 }
04354 
04355 /// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
04356 /// all the same.
04357 static bool isSplatVector(SDNode *N) {
04358   if (N->getOpcode() != ISD::BUILD_VECTOR)
04359     return false;
04360 
04361   SDValue SplatValue = N->getOperand(0);
04362   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
04363     if (N->getOperand(i) != SplatValue)
04364       return false;
04365   return true;
04366 }
04367 
04368 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
04369 /// to an zero vector.
04370 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
04371 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
04372   SDValue V1 = N->getOperand(0);
04373   SDValue V2 = N->getOperand(1);
04374   unsigned NumElems = N->getValueType(0).getVectorNumElements();
04375   for (unsigned i = 0; i != NumElems; ++i) {
04376     int Idx = N->getMaskElt(i);
04377     if (Idx >= (int)NumElems) {
04378       unsigned Opc = V2.getOpcode();
04379       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
04380         continue;
04381       if (Opc != ISD::BUILD_VECTOR ||
04382           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
04383         return false;
04384     } else if (Idx >= 0) {
04385       unsigned Opc = V1.getOpcode();
04386       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
04387         continue;
04388       if (Opc != ISD::BUILD_VECTOR ||
04389           !X86::isZeroNode(V1.getOperand(Idx)))
04390         return false;
04391     }
04392   }
04393   return true;
04394 }
04395 
04396 /// getZeroVector - Returns a vector of specified type with all zero elements.
04397 ///
04398 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
04399                              SelectionDAG &DAG, SDLoc dl) {
04400   assert(VT.isVector() && "Expected a vector type");
04401 
04402   // Always build SSE zero vectors as <4 x i32> bitcasted
04403   // to their dest type. This ensures they get CSE'd.
04404   SDValue Vec;
04405   if (VT.is128BitVector()) {  // SSE
04406     if (Subtarget->hasSSE2()) {  // SSE2
04407       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04408       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04409     } else { // SSE1
04410       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
04411       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
04412     }
04413   } else if (VT.is256BitVector()) { // AVX
04414     if (Subtarget->hasInt256()) { // AVX2
04415       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
04416       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04417       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
04418                         array_lengthof(Ops));
04419     } else {
04420       // 256-bit logic and arithmetic instructions in AVX are all
04421       // floating-point, no support for integer ops. Emit fp zeroed vectors.
04422       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
04423       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04424       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
04425                         array_lengthof(Ops));
04426     }
04427   } else
04428     llvm_unreachable("Unexpected vector type");
04429 
04430   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04431 }
04432 
04433 /// getOnesVector - Returns a vector of specified type with all bits set.
04434 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
04435 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
04436 /// Then bitcast to their original type, ensuring they get CSE'd.
04437 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
04438                              SDLoc dl) {
04439   assert(VT.isVector() && "Expected a vector type");
04440 
04441   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
04442   SDValue Vec;
04443   if (VT.is256BitVector()) {
04444     if (HasInt256) { // AVX2
04445       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04446       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
04447                         array_lengthof(Ops));
04448     } else { // AVX
04449       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04450       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
04451     }
04452   } else if (VT.is128BitVector()) {
04453     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04454   } else
04455     llvm_unreachable("Unexpected vector type");
04456 
04457   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04458 }
04459 
04460 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
04461 /// that point to V2 points to its first element.
04462 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
04463   for (unsigned i = 0; i != NumElems; ++i) {
04464     if (Mask[i] > (int)NumElems) {
04465       Mask[i] = NumElems;
04466     }
04467   }
04468 }
04469 
04470 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
04471 /// operation of specified width.
04472 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
04473                        SDValue V2) {
04474   unsigned NumElems = VT.getVectorNumElements();
04475   SmallVector<int, 8> Mask;
04476   Mask.push_back(NumElems);
04477   for (unsigned i = 1; i != NumElems; ++i)
04478     Mask.push_back(i);
04479   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04480 }
04481 
04482 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
04483 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
04484                           SDValue V2) {
04485   unsigned NumElems = VT.getVectorNumElements();
04486   SmallVector<int, 8> Mask;
04487   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
04488     Mask.push_back(i);
04489     Mask.push_back(i + NumElems);
04490   }
04491   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04492 }
04493 
04494 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
04495 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
04496                           SDValue V2) {
04497   unsigned NumElems = VT.getVectorNumElements();
04498   SmallVector<int, 8> Mask;
04499   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
04500     Mask.push_back(i + Half);
04501     Mask.push_back(i + NumElems + Half);
04502   }
04503   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04504 }
04505 
04506 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
04507 // a generic shuffle instruction because the target has no such instructions.
04508 // Generate shuffles which repeat i16 and i8 several times until they can be
04509 // represented by v4f32 and then be manipulated by target suported shuffles.
04510 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
04511   EVT VT = V.getValueType();
04512   int NumElems = VT.getVectorNumElements();
04513   SDLoc dl(V);
04514 
04515   while (NumElems > 4) {
04516     if (EltNo < NumElems/2) {
04517       V = getUnpackl(DAG, dl, VT, V, V);
04518     } else {
04519       V = getUnpackh(DAG, dl, VT, V, V);
04520       EltNo -= NumElems/2;
04521     }
04522     NumElems >>= 1;
04523   }
04524   return V;
04525 }
04526 
04527 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
04528 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
04529   EVT VT = V.getValueType();
04530   SDLoc dl(V);
04531 
04532   if (VT.is128BitVector()) {
04533     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
04534     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
04535     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
04536                              &SplatMask[0]);
04537   } else if (VT.is256BitVector()) {
04538     // To use VPERMILPS to splat scalars, the second half of indicies must
04539     // refer to the higher part, which is a duplication of the lower one,
04540     // because VPERMILPS can only handle in-lane permutations.
04541     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
04542                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
04543 
04544     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
04545     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
04546                              &SplatMask[0]);
04547   } else
04548     llvm_unreachable("Vector size not supported");
04549 
04550   return DAG.getNode(ISD::BITCAST, dl, VT, V);
04551 }
04552 
04553 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
04554 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
04555   EVT SrcVT = SV->getValueType(0);
04556   SDValue V1 = SV->getOperand(0);
04557   SDLoc dl(SV);
04558 
04559   int EltNo = SV->getSplatIndex();
04560   int NumElems = SrcVT.getVectorNumElements();
04561   bool Is256BitVec = SrcVT.is256BitVector();
04562 
04563   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
04564          "Unknown how to promote splat for type");
04565 
04566   // Extract the 128-bit part containing the splat element and update
04567   // the splat element index when it refers to the higher register.
04568   if (Is256BitVec) {
04569     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
04570     if (EltNo >= NumElems/2)
04571       EltNo -= NumElems/2;
04572   }
04573 
04574   // All i16 and i8 vector types can't be used directly by a generic shuffle
04575   // instruction because the target has no such instruction. Generate shuffles
04576   // which repeat i16 and i8 several times until they fit in i32, and then can
04577   // be manipulated by target suported shuffles.
04578   EVT EltVT = SrcVT.getVectorElementType();
04579   if (EltVT == MVT::i8 || EltVT == MVT::i16)
04580     V1 = PromoteSplati8i16(V1, DAG, EltNo);
04581 
04582   // Recreate the 256-bit vector and place the same 128-bit vector
04583   // into the low and high part. This is necessary because we want
04584   // to use VPERM* to shuffle the vectors
04585   if (Is256BitVec) {
04586     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
04587   }
04588 
04589   return getLegalSplat(DAG, V1, EltNo);
04590 }
04591 
04592 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
04593 /// vector of zero or undef vector.  This produces a shuffle where the low
04594 /// element of V2 is swizzled into the zero/undef vector, landing at element
04595 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
04596 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
04597                                            bool IsZero,
04598                                            const X86Subtarget *Subtarget,
04599                                            SelectionDAG &DAG) {
04600   EVT VT = V2.getValueType();
04601   SDValue V1 = IsZero
04602     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
04603   unsigned NumElems = VT.getVectorNumElements();
04604   SmallVector<int, 16> MaskVec;
04605   for (unsigned i = 0; i != NumElems; ++i)
04606     // If this is the insertion idx, put the low elt of V2 here.
04607     MaskVec.push_back(i == Idx ? NumElems : i);
04608   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
04609 }
04610 
04611 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
04612 /// target specific opcode. Returns true if the Mask could be calculated.
04613 /// Sets IsUnary to true if only uses one source.
04614 static bool getTargetShuffleMask(SDNode *N, MVT VT,
04615                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
04616   unsigned NumElems = VT.getVectorNumElements();
04617   SDValue ImmN;
04618 
04619   IsUnary = false;
04620   switch(N->getOpcode()) {
04621   case X86ISD::SHUFP:
04622     ImmN = N->getOperand(N->getNumOperands()-1);
04623     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04624     break;
04625   case X86ISD::UNPCKH:
04626     DecodeUNPCKHMask(VT, Mask);
04627     break;
04628   case X86ISD::UNPCKL:
04629     DecodeUNPCKLMask(VT, Mask);
04630     break;
04631   case X86ISD::MOVHLPS:
04632     DecodeMOVHLPSMask(NumElems, Mask);
04633     break;
04634   case X86ISD::MOVLHPS:
04635     DecodeMOVLHPSMask(NumElems, Mask);
04636     break;
04637   case X86ISD::PALIGNR:
04638     ImmN = N->getOperand(N->getNumOperands()-1);
04639     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04640     break;
04641   case X86ISD::PSHUFD:
04642   case X86ISD::VPERMILP:
04643     ImmN = N->getOperand(N->getNumOperands()-1);
04644     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04645     IsUnary = true;
04646     break;
04647   case X86ISD::PSHUFHW:
04648     ImmN = N->getOperand(N->getNumOperands()-1);
04649     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04650     IsUnary = true;
04651     break;
04652   case X86ISD::PSHUFLW:
04653     ImmN = N->getOperand(N->getNumOperands()-1);
04654     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04655     IsUnary = true;
04656     break;
04657   case X86ISD::VPERMI:
04658     ImmN = N->getOperand(N->getNumOperands()-1);
04659     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04660     IsUnary = true;
04661     break;
04662   case X86ISD::MOVSS:
04663   case X86ISD::MOVSD: {
04664     // The index 0 always comes from the first element of the second source,
04665     // this is why MOVSS and MOVSD are used in the first place. The other
04666     // elements come from the other positions of the first source vector
04667     Mask.push_back(NumElems);
04668     for (unsigned i = 1; i != NumElems; ++i) {
04669       Mask.push_back(i);
04670     }
04671     break;
04672   }
04673   case X86ISD::VPERM2X128:
04674     ImmN = N->getOperand(N->getNumOperands()-1);
04675     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04676     if (Mask.empty()) return false;
04677     break;
04678   case X86ISD::MOVDDUP:
04679   case X86ISD::MOVLHPD:
04680   case X86ISD::MOVLPD:
04681   case X86ISD::MOVLPS:
04682   case X86ISD::MOVSHDUP:
04683   case X86ISD::MOVSLDUP:
04684     // Not yet implemented
04685     return false;
04686   default: llvm_unreachable("unknown target shuffle node");
04687   }
04688 
04689   return true;
04690 }
04691 
04692 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
04693 /// element of the result of the vector shuffle.
04694 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
04695                                    unsigned Depth) {
04696   if (Depth == 6)
04697     return SDValue();  // Limit search depth.
04698 
04699   SDValue V = SDValue(N, 0);
04700   EVT VT = V.getValueType();
04701   unsigned Opcode = V.getOpcode();
04702 
04703   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
04704   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
04705     int Elt = SV->getMaskElt(Index);
04706 
04707     if (Elt < 0)
04708       return DAG.getUNDEF(VT.getVectorElementType());
04709 
04710     unsigned NumElems = VT.getVectorNumElements();
04711     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
04712                                          : SV->getOperand(1);
04713     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
04714   }
04715 
04716   // Recurse into target specific vector shuffles to find scalars.
04717   if (isTargetShuffle(Opcode)) {
04718     MVT ShufVT = V.getValueType().getSimpleVT();
04719     unsigned NumElems = ShufVT.getVectorNumElements();
04720     SmallVector<int, 16> ShuffleMask;
04721     bool IsUnary;
04722 
04723     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
04724       return SDValue();
04725 
04726     int Elt = ShuffleMask[Index];
04727     if (Elt < 0)
04728       return DAG.getUNDEF(ShufVT.getVectorElementType());
04729 
04730     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
04731                                          : N->getOperand(1);
04732     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
04733                                Depth+1);
04734   }
04735 
04736   // Actual nodes that may contain scalar elements
04737   if (Opcode == ISD::BITCAST) {
04738     V = V.getOperand(0);
04739     EVT SrcVT = V.getValueType();
04740     unsigned NumElems = VT.getVectorNumElements();
04741 
04742     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
04743       return SDValue();
04744   }
04745 
04746   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
04747     return (Index == 0) ? V.getOperand(0)
04748                         : DAG.getUNDEF(VT.getVectorElementType());
04749 
04750   if (V.getOpcode() == ISD::BUILD_VECTOR)
04751     return V.getOperand(Index);
04752 
04753   return SDValue();
04754 }
04755 
04756 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
04757 /// shuffle operation which come from a consecutively from a zero. The
04758 /// search can start in two different directions, from left or right.
04759 /// We count undefs as zeros until PreferredNum is reached.
04760 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
04761                                          unsigned NumElems, bool ZerosFromLeft,
04762                                          SelectionDAG &DAG,
04763                                          unsigned PreferredNum = -1U) {
04764   unsigned NumZeros = 0;
04765   for (unsigned i = 0; i != NumElems; ++i) {
04766     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
04767     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
04768     if (!Elt.getNode())
04769       break;
04770 
04771     if (X86::isZeroNode(Elt))
04772       ++NumZeros;
04773     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
04774       NumZeros = std::min(NumZeros + 1, PreferredNum);
04775     else
04776       break;
04777   }
04778 
04779   return NumZeros;
04780 }
04781 
04782 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
04783 /// correspond consecutively to elements from one of the vector operands,
04784 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
04785 static
04786 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
04787                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
04788                               unsigned NumElems, unsigned &OpNum) {
04789   bool SeenV1 = false;
04790   bool SeenV2 = false;
04791 
04792   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
04793     int Idx = SVOp->getMaskElt(i);
04794     // Ignore undef indicies
04795     if (Idx < 0)
04796       continue;
04797 
04798     if (Idx < (int)NumElems)
04799       SeenV1 = true;
04800     else
04801       SeenV2 = true;
04802 
04803     // Only accept consecutive elements from the same vector
04804     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
04805       return false;
04806   }
04807 
04808   OpNum = SeenV1 ? 0 : 1;
04809   return true;
04810 }
04811 
04812 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
04813 /// logical left shift of a vector.
04814 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
04815                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
04816   unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
04817   unsigned NumZeros = getNumOfConsecutiveZeros(
04818       SVOp, NumElems, false /* check zeros from right */, DAG,
04819       SVOp->getMaskElt(0));
04820   unsigned OpSrc;
04821 
04822   if (!NumZeros)
04823     return false;
04824 
04825   // Considering the elements in the mask that are not consecutive zeros,
04826   // check if they consecutively come from only one of the source vectors.
04827   //
04828   //               V1 = {X, A, B, C}     0
04829   //                         \  \  \    /
04830   //   vector_shuffle V1, V2 <1, 2, 3, X>
04831   //
04832   if (!isShuffleMaskConsecutive(SVOp,
04833             0,                   // Mask Start Index
04834             NumElems-NumZeros,   // Mask End Index(exclusive)
04835             NumZeros,            // Where to start looking in the src vector
04836             NumElems,            // Number of elements in vector
04837             OpSrc))              // Which source operand ?
04838     return false;
04839 
04840   isLeft = false;
04841   ShAmt = NumZeros;
04842   ShVal = SVOp->getOperand(OpSrc);
04843   return true;
04844 }
04845 
04846 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
04847 /// logical left shift of a vector.
04848 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
04849                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
04850   unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
04851   unsigned NumZeros = getNumOfConsecutiveZeros(
04852       SVOp, NumElems, true /* check zeros from left */, DAG,
04853       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
04854   unsigned OpSrc;
04855 
04856   if (!NumZeros)
04857     return false;
04858 
04859   // Considering the elements in the mask that are not consecutive zeros,
04860   // check if they consecutively come from only one of the source vectors.
04861   //
04862   //                           0    { A, B, X, X } = V2
04863   //                          / \    /  /
04864   //   vector_shuffle V1, V2 <X, X, 4, 5>
04865   //
04866   if (!isShuffleMaskConsecutive(SVOp,
04867             NumZeros,     // Mask Start Index
04868             NumElems,     // Mask End Index(exclusive)
04869             0,            // Where to start looking in the src vector
04870             NumElems,     // Number of elements in vector
04871             OpSrc))       // Which source operand ?
04872     return false;
04873 
04874   isLeft = true;
04875   ShAmt = NumZeros;
04876   ShVal = SVOp->getOperand(OpSrc);
04877   return true;
04878 }
04879 
04880 /// isVectorShift - Returns true if the shuffle can be implemented as a
04881 /// logical left or right shift of a vector.
04882 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
04883                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
04884   // Although the logic below support any bitwidth size, there are no
04885   // shift instructions which handle more than 128-bit vectors.
04886   if (!SVOp->getValueType(0).is128BitVector())
04887     return false;
04888 
04889   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
04890       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
04891     return true;
04892 
04893   return false;
04894 }
04895 
04896 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
04897 ///
04898 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
04899                                        unsigned NumNonZero, unsigned NumZero,
04900                                        SelectionDAG &DAG,
04901                                        const X86Subtarget* Subtarget,
04902                                        const TargetLowering &TLI) {
04903   if (NumNonZero > 8)
04904     return SDValue();
04905 
04906   SDLoc dl(Op);
04907   SDValue V(0, 0);
04908   bool First = true;
04909   for (unsigned i = 0; i < 16; ++i) {
04910     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
04911     if (ThisIsNonZero && First) {
04912       if (NumZero)
04913         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
04914       else
04915         V = DAG.getUNDEF(MVT::v8i16);
04916       First = false;
04917     }
04918 
04919     if ((i & 1) != 0) {
04920       SDValue ThisElt(0, 0), LastElt(0, 0);
04921       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
04922       if (LastIsNonZero) {
04923         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
04924                               MVT::i16, Op.getOperand(i-1));
04925       }
04926       if (ThisIsNonZero) {
04927         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
04928         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
04929                               ThisElt, DAG.getConstant(8, MVT::i8));
04930         if (LastIsNonZero)
04931           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
04932       } else
04933         ThisElt = LastElt;
04934 
04935       if (ThisElt.getNode())
04936         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
04937                         DAG.getIntPtrConstant(i/2));
04938     }
04939   }
04940 
04941   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
04942 }
04943 
04944 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
04945 ///
04946 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
04947                                      unsigned NumNonZero, unsigned NumZero,
04948                                      SelectionDAG &DAG,
04949                                      const X86Subtarget* Subtarget,
04950                                      const TargetLowering &TLI) {
04951   if (NumNonZero > 4)
04952     return SDValue();
04953 
04954   SDLoc dl(Op);
04955   SDValue V(0, 0);
04956   bool First = true;
04957   for (unsigned i = 0; i < 8; ++i) {
04958     bool isNonZero = (NonZeros & (1 << i)) != 0;
04959     if (isNonZero) {
04960       if (First) {
04961         if (NumZero)
04962           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
04963         else
04964           V = DAG.getUNDEF(MVT::v8i16);
04965         First = false;
04966       }
04967       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
04968                       MVT::v8i16, V, Op.getOperand(i),
04969                       DAG.getIntPtrConstant(i));
04970     }
04971   }
04972 
04973   return V;
04974 }
04975 
04976 /// getVShift - Return a vector logical shift node.
04977 ///
04978 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
04979                          unsigned NumBits, SelectionDAG &DAG,
04980                          const TargetLowering &TLI, SDLoc dl) {
04981   assert(VT.is128BitVector() && "Unknown type for VShift");
04982   EVT ShVT = MVT::v2i64;
04983   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
04984   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
04985   return DAG.getNode(ISD::BITCAST, dl, VT,
04986                      DAG.getNode(Opc, dl, ShVT, SrcOp,
04987                              DAG.getConstant(NumBits,
04988                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
04989 }
04990 
04991 SDValue
04992 X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
04993                                           SelectionDAG &DAG) const {
04994 
04995   // Check if the scalar load can be widened into a vector load. And if
04996   // the address is "base + cst" see if the cst can be "absorbed" into
04997   // the shuffle mask.
04998   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
04999     SDValue Ptr = LD->getBasePtr();
05000     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
05001       return SDValue();
05002     EVT PVT = LD->getValueType(0);
05003     if (PVT != MVT::i32 && PVT != MVT::f32)
05004       return SDValue();
05005 
05006     int FI = -1;
05007     int64_t Offset = 0;
05008     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
05009       FI = FINode->getIndex();
05010       Offset = 0;
05011     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
05012                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
05013       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
05014       Offset = Ptr.getConstantOperandVal(1);
05015       Ptr = Ptr.getOperand(0);
05016     } else {
05017       return SDValue();
05018     }
05019 
05020     // FIXME: 256-bit vector instructions don't require a strict alignment,
05021     // improve this code to support it better.
05022     unsigned RequiredAlign = VT.getSizeInBits()/8;
05023     SDValue Chain = LD->getChain();
05024     // Make sure the stack object alignment is at least 16 or 32.
05025     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
05026     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
05027       if (MFI->isFixedObjectIndex(FI)) {
05028         // Can't change the alignment. FIXME: It's possible to compute
05029         // the exact stack offset and reference FI + adjust offset instead.
05030         // If someone *really* cares about this. That's the way to implement it.
05031         return SDValue();
05032       } else {
05033         MFI->setObjectAlignment(FI, RequiredAlign);
05034       }
05035     }
05036 
05037     // (Offset % 16 or 32) must be multiple of 4. Then address is then
05038     // Ptr + (Offset & ~15).
05039     if (Offset < 0)
05040       return SDValue();
05041     if ((Offset % RequiredAlign) & 3)
05042       return SDValue();
05043     int64_t StartOffset = Offset & ~(RequiredAlign-1);
05044     if (StartOffset)
05045       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
05046                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
05047 
05048     int EltNo = (Offset - StartOffset) >> 2;
05049     unsigned NumElems = VT.getVectorNumElements();
05050 
05051     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
05052     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
05053                              LD->getPointerInfo().getWithOffset(StartOffset),
05054                              false, false, false, 0);
05055 
05056     SmallVector<int, 8> Mask;
05057     for (unsigned i = 0; i != NumElems; ++i)
05058       Mask.push_back(EltNo);
05059 
05060     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
05061   }
05062 
05063   return SDValue();
05064 }
05065 
05066 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
05067 /// vector of type 'VT', see if the elements can be replaced by a single large
05068 /// load which has the same value as a build_vector whose operands are 'elts'.
05069 ///
05070 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
05071 ///
05072 /// FIXME: we'd also like to handle the case where the last elements are zero
05073 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
05074 /// There's even a handy isZeroNode for that purpose.
05075 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
05076                                         SDLoc &DL, SelectionDAG &DAG) {
05077   EVT EltVT = VT.getVectorElementType();
05078   unsigned NumElems = Elts.size();
05079 
05080   LoadSDNode *LDBase = NULL;
05081   unsigned LastLoadedElt = -1U;
05082 
05083   // For each element in the initializer, see if we've found a load or an undef.
05084   // If we don't find an initial load element, or later load elements are
05085   // non-consecutive, bail out.
05086   for (unsigned i = 0; i < NumElems; ++i) {
05087     SDValue Elt = Elts[i];
05088 
05089     if (!Elt.getNode() ||
05090         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
05091       return SDValue();
05092     if (!LDBase) {
05093       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
05094         return SDValue();
05095       LDBase = cast<LoadSDNode>(Elt.getNode());
05096       LastLoadedElt = i;
05097       continue;
05098     }
05099     if (Elt.getOpcode() == ISD::UNDEF)
05100       continue;
05101 
05102     LoadSDNode *LD = cast<LoadSDNode>(Elt);
05103     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
05104       return SDValue();
05105     LastLoadedElt = i;
05106   }
05107 
05108   // If we have found an entire vector of loads and undefs, then return a large
05109   // load of the entire vector width starting at the base pointer.  If we found
05110   // consecutive loads for the low half, generate a vzext_load node.
05111   if (LastLoadedElt == NumElems - 1) {
05112     SDValue NewLd = SDValue();
05113     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
05114       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05115                           LDBase->getPointerInfo(),
05116                           LDBase->isVolatile(), LDBase->isNonTemporal(),
05117                           LDBase->isInvariant(), 0);
05118     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
05119                         LDBase->getPointerInfo(),
05120                         LDBase->isVolatile(), LDBase->isNonTemporal(),
05121                         LDBase->isInvariant(), LDBase->getAlignment());
05122 
05123     if (LDBase->hasAnyUseOfValue(1)) {
05124       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05125                                      SDValue(LDBase, 1),
05126                                      SDValue(NewLd.getNode(), 1));
05127       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05128       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05129                              SDValue(NewLd.getNode(), 1));
05130     }
05131 
05132     return NewLd;
05133   }
05134   if (NumElems == 4 && LastLoadedElt == 1 &&
05135       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
05136     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
05137     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
05138     SDValue ResNode =
05139         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
05140                                 array_lengthof(Ops), MVT::i64,
05141                                 LDBase->getPointerInfo(),
05142                                 LDBase->getAlignment(),
05143                                 false/*isVolatile*/, true/*ReadMem*/,
05144                                 false/*WriteMem*/);
05145 
05146     // Make sure the newly-created LOAD is in the same position as LDBase in
05147     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
05148     // update uses of LDBase's output chain to use the TokenFactor.
05149     if (LDBase->hasAnyUseOfValue(1)) {
05150       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
05151                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
05152       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
05153       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
05154                              SDValue(ResNode.getNode(), 1));
05155     }
05156 
05157     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
05158   }
05159   return SDValue();
05160 }
05161 
05162 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
05163 /// to generate a splat value for the following cases:
05164 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
05165 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
05166 /// a scalar load, or a constant.
05167 /// The VBROADCAST node is returned when a pattern is found,
05168 /// or SDValue() otherwise.
05169 SDValue
05170 X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
05171   if (!Subtarget->hasFp256())
05172     return SDValue();
05173 
05174   MVT VT = Op.getValueType().getSimpleVT();
05175   SDLoc dl(Op);
05176 
05177   assert((VT.is128BitVector() || VT.is256BitVector()) &&
05178          "Unsupported vector type for broadcast.");
05179 
05180   SDValue Ld;
05181   bool ConstSplatVal;
05182 
05183   switch (Op.getOpcode()) {
05184     default:
05185       // Unknown pattern found.
05186       return SDValue();
05187 
05188     case ISD::BUILD_VECTOR: {
05189       // The BUILD_VECTOR node must be a splat.
05190       if (!isSplatVector(Op.getNode()))
05191         return SDValue();
05192 
05193       Ld = Op.getOperand(0);
05194       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
05195                      Ld.getOpcode() == ISD::ConstantFP);
05196 
05197       // The suspected load node has several users. Make sure that all
05198       // of its users are from the BUILD_VECTOR node.
05199       // Constants may have multiple users.
05200       if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
05201         return SDValue();
05202       break;
05203     }
05204 
05205     case ISD::VECTOR_SHUFFLE: {
05206       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
05207 
05208       // Shuffles must have a splat mask where the first element is
05209       // broadcasted.
05210       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
05211         return SDValue();
05212 
05213       SDValue Sc = Op.getOperand(0);
05214       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
05215           Sc.getOpcode() != ISD::BUILD_VECTOR) {
05216 
05217         if (!Subtarget->hasInt256())
05218           return SDValue();
05219 
05220         // Use the register form of the broadcast instruction available on AVX2.
05221         if (VT.is256BitVector())
05222           Sc = Extract128BitVector(Sc, 0, DAG, dl);
05223         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
05224       }
05225 
05226       Ld = Sc.getOperand(0);
05227       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
05228                        Ld.getOpcode() == ISD::ConstantFP);
05229 
05230       // The scalar_to_vector node and the suspected
05231       // load node must have exactly one user.
05232       // Constants may have multiple users.
05233       if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
05234         return SDValue();
05235       break;
05236     }
05237   }
05238 
05239   bool Is256 = VT.is256BitVector();
05240 
05241   // Handle the broadcasting a single constant scalar from the constant pool
05242   // into a vector. On Sandybridge it is still better to load a constant vector
05243   // from the constant pool and not to broadcast it from a scalar.
05244   if (ConstSplatVal && Subtarget->hasInt256()) {
05245     EVT CVT = Ld.getValueType();
05246     assert(!CVT.isVector() && "Must not broadcast a vector type");
05247     unsigned ScalarSize = CVT.getSizeInBits();
05248 
05249     if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
05250       const Constant *C = 0;
05251       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
05252         C = CI->getConstantIntValue();
05253       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
05254         C = CF->getConstantFPValue();
05255 
05256       assert(C && "Invalid constant type");
05257 
05258       SDValue CP = DAG.getConstantPool(C, getPointerTy());
05259       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
05260       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
05261                        MachinePointerInfo::getConstantPool(),
05262                        false, false, false, Alignment);
05263 
05264       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05265     }
05266   }
05267 
05268   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
05269   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
05270 
05271   // Handle AVX2 in-register broadcasts.
05272   if (!IsLoad && Subtarget->hasInt256() &&
05273       (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
05274     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05275 
05276   // The scalar source must be a normal load.
05277   if (!IsLoad)
05278     return SDValue();
05279 
05280   if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
05281     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05282 
05283   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
05284   // double since there is no vbroadcastsd xmm
05285   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
05286     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
05287       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05288   }
05289 
05290   // Unsupported broadcast.
05291   return SDValue();
05292 }
05293 
05294 SDValue
05295 X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
05296   EVT VT = Op.getValueType();
05297 
05298   // Skip if insert_vec_elt is not supported.
05299   if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
05300     return SDValue();
05301 
05302   SDLoc DL(Op);
05303   unsigned NumElems = Op.getNumOperands();
05304 
05305   SDValue VecIn1;
05306   SDValue VecIn2;
05307   SmallVector<unsigned, 4> InsertIndices;
05308   SmallVector<int, 8> Mask(NumElems, -1);
05309 
05310   for (unsigned i = 0; i != NumElems; ++i) {
05311     unsigned Opc = Op.getOperand(i).getOpcode();
05312 
05313     if (Opc == ISD::UNDEF)
05314       continue;
05315 
05316     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
05317       // Quit if more than 1 elements need inserting.
05318       if (InsertIndices.size() > 1)
05319         return SDValue();
05320 
05321       InsertIndices.push_back(i);
05322       continue;
05323     }
05324 
05325     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
05326     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
05327 
05328     // Quit if extracted from vector of different type.
05329     if (ExtractedFromVec.getValueType() != VT)
05330       return SDValue();
05331 
05332     // Quit if non-constant index.
05333     if (!isa<ConstantSDNode>(ExtIdx))
05334       return SDValue();
05335 
05336     if (VecIn1.getNode() == 0)
05337       VecIn1 = ExtractedFromVec;
05338     else if (VecIn1 != ExtractedFromVec) {
05339       if (VecIn2.getNode() == 0)
05340         VecIn2 = ExtractedFromVec;
05341       else if (VecIn2 != ExtractedFromVec)
05342         // Quit if more than 2 vectors to shuffle
05343         return SDValue();
05344     }
05345 
05346     unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
05347 
05348     if (ExtractedFromVec == VecIn1)
05349       Mask[i] = Idx;
05350     else if (ExtractedFromVec == VecIn2)
05351       Mask[i] = Idx + NumElems;
05352   }
05353 
05354   if (VecIn1.getNode() == 0)
05355     return SDValue();
05356 
05357   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
05358   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
05359   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
05360     unsigned Idx = InsertIndices[i];
05361     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
05362                      DAG.getIntPtrConstant(Idx));
05363   }
05364 
05365   return NV;
05366 }
05367 
05368 SDValue
05369 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
05370   SDLoc dl(Op);
05371 
05372   MVT VT = Op.getValueType().getSimpleVT();
05373   MVT ExtVT = VT.getVectorElementType();
05374   unsigned NumElems = Op.getNumOperands();
05375 
05376   // Vectors containing all zeros can be matched by pxor and xorps later
05377   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05378     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
05379     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
05380     if (VT == MVT::v4i32 || VT == MVT::v8i32)
05381       return Op;
05382 
05383     return getZeroVector(VT, Subtarget, DAG, dl);
05384   }
05385 
05386   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
05387   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
05388   // vpcmpeqd on 256-bit vectors.
05389   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
05390     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
05391       return Op;
05392 
05393     return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
05394   }
05395 
05396   SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
05397   if (Broadcast.getNode())
05398     return Broadcast;
05399 
05400   unsigned EVTBits = ExtVT.getSizeInBits();
05401 
05402   unsigned NumZero  = 0;
05403   unsigned NumNonZero = 0;
05404   unsigned NonZeros = 0;
05405   bool IsAllConstants = true;
05406   SmallSet<SDValue, 8> Values;
05407   for (unsigned i = 0; i < NumElems; ++i) {
05408     SDValue Elt = Op.getOperand(i);
05409     if (Elt.getOpcode() == ISD::UNDEF)
05410       continue;
05411     Values.insert(Elt);
05412     if (Elt.getOpcode() != ISD::Constant &&
05413         Elt.getOpcode() != ISD::ConstantFP)
05414       IsAllConstants = false;
05415     if (X86::isZeroNode(Elt))
05416       NumZero++;
05417     else {
05418       NonZeros |= (1 << i);
05419       NumNonZero++;
05420     }
05421   }
05422 
05423   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
05424   if (NumNonZero == 0)
05425     return DAG.getUNDEF(VT);
05426 
05427   // Special case for single non-zero, non-undef, element.
05428   if (NumNonZero == 1) {
05429     unsigned Idx = countTrailingZeros(NonZeros);
05430     SDValue Item = Op.getOperand(Idx);
05431 
05432     // If this is an insertion of an i64 value on x86-32, and if the top bits of
05433     // the value are obviously zero, truncate the value to i32 and do the
05434     // insertion that way.  Only do this if the value is non-constant or if the
05435     // value is a constant being inserted into element 0.  It is cheaper to do
05436     // a constant pool load than it is to do a movd + shuffle.
05437     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
05438         (!IsAllConstants || Idx == 0)) {
05439       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
05440         // Handle SSE only.
05441         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
05442         EVT VecVT = MVT::v4i32;
05443         unsigned VecElts = 4;
05444 
05445         // Truncate the value (which may itself be a constant) to i32, and
05446         // convert it to a vector with movd (S2V+shuffle to zero extend).
05447         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
05448         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
05449         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05450 
05451         // Now we have our 32-bit value zero extended in the low element of
05452         // a vector.  If Idx != 0, swizzle it into place.
05453         if (Idx != 0) {
05454           SmallVector<int, 4> Mask;
05455           Mask.push_back(Idx);
05456           for (unsigned i = 1; i != VecElts; ++i)
05457             Mask.push_back(i);
05458           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
05459                                       &Mask[0]);
05460         }
05461         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
05462       }
05463     }
05464 
05465     // If we have a constant or non-constant insertion into the low element of
05466     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
05467     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
05468     // depending on what the source datatype is.
05469     if (Idx == 0) {
05470       if (NumZero == 0)
05471         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05472 
05473       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
05474           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
05475         if (VT.is256BitVector()) {
05476           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
05477           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
05478                              Item, DAG.getIntPtrConstant(0));
05479         }
05480         assert(VT.is128BitVector() && "Expected an SSE value type!");
05481         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05482         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
05483         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05484       }
05485 
05486       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
05487         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
05488         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
05489         if (VT.is256BitVector()) {
05490           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
05491           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
05492         } else {
05493           assert(VT.is128BitVector() && "Expected an SSE value type!");
05494           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05495         }
05496         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
05497       }
05498     }
05499 
05500     // Is it a vector logical left shift?
05501     if (NumElems == 2 && Idx == 1 &&
05502         X86::isZeroNode(Op.getOperand(0)) &&
05503         !X86::isZeroNode(Op.getOperand(1))) {
05504       unsigned NumBits = VT.getSizeInBits();
05505       return getVShift(true, VT,
05506                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
05507                                    VT, Op.getOperand(1)),
05508                        NumBits/2, DAG, *this, dl);
05509     }
05510 
05511     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
05512       return SDValue();
05513 
05514     // Otherwise, if this is a vector with i32 or f32 elements, and the element
05515     // is a non-constant being inserted into an element other than the low one,
05516     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
05517     // movd/movss) to move this into the low element, then shuffle it into
05518     // place.
05519     if (EVTBits == 32) {
05520       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05521 
05522       // Turn it into a shuffle of zero and zero-extended scalar to vector.
05523       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
05524       SmallVector<int, 8> MaskVec;
05525       for (unsigned i = 0; i != NumElems; ++i)
05526         MaskVec.push_back(i == Idx ? 0 : 1);
05527       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &