00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86FrameLowering.h"
00019 #include "X86InstrBuilder.h"
00020 #include "X86MachineFunctionInfo.h"
00021 #include "X86TargetMachine.h"
00022 #include "X86TargetObjectFile.h"
00023 #include "llvm/ADT/SmallBitVector.h"
00024 #include "llvm/ADT/SmallSet.h"
00025 #include "llvm/ADT/Statistic.h"
00026 #include "llvm/ADT/StringExtras.h"
00027 #include "llvm/ADT/StringSwitch.h"
00028 #include "llvm/CodeGen/IntrinsicLowering.h"
00029 #include "llvm/CodeGen/MachineFrameInfo.h"
00030 #include "llvm/CodeGen/MachineFunction.h"
00031 #include "llvm/CodeGen/MachineInstrBuilder.h"
00032 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00033 #include "llvm/CodeGen/MachineModuleInfo.h"
00034 #include "llvm/CodeGen/MachineRegisterInfo.h"
00035 #include "llvm/IR/CallSite.h"
00036 #include "llvm/IR/CallingConv.h"
00037 #include "llvm/IR/Constants.h"
00038 #include "llvm/IR/DerivedTypes.h"
00039 #include "llvm/IR/Function.h"
00040 #include "llvm/IR/GlobalAlias.h"
00041 #include "llvm/IR/GlobalVariable.h"
00042 #include "llvm/IR/Instructions.h"
00043 #include "llvm/IR/Intrinsics.h"
00044 #include "llvm/MC/MCAsmInfo.h"
00045 #include "llvm/MC/MCContext.h"
00046 #include "llvm/MC/MCExpr.h"
00047 #include "llvm/MC/MCSymbol.h"
00048 #include "llvm/Support/CommandLine.h"
00049 #include "llvm/Support/Debug.h"
00050 #include "llvm/Support/ErrorHandling.h"
00051 #include "llvm/Support/MathExtras.h"
00052 #include "llvm/Target/TargetOptions.h"
00053 #include "X86IntrinsicsInfo.h"
00054 #include <bitset>
00055 #include <numeric>
00056 #include <cctype>
00057 using namespace llvm;
00058 
00059 #define DEBUG_TYPE "x86-isel"
00060 
00061 STATISTIC(NumTailCalls, "Number of tail calls");
00062 
00063 static cl::opt<bool> ExperimentalVectorWideningLegalization(
00064     "x86-experimental-vector-widening-legalization", cl::init(false),
00065     cl::desc("Enable an experimental vector type legalization through widening "
00066              "rather than promotion."),
00067     cl::Hidden);
00068 
00069 static cl::opt<int> ReciprocalEstimateRefinementSteps(
00070     "x86-recip-refinement-steps", cl::init(1),
00071     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
00072              "result of the hardware reciprocal estimate instruction."),
00073     cl::NotHidden);
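// Each Newton-Raphson step roughly doubles the number of accurate bits: the
// hardware RCPPS/RSQRTPS estimates are good to about 12 bits, so the default of
// one refinement step gives roughly single-precision accuracy (illustrative
// reciprocal iteration: x1 = x0 * (2 - a * x0)).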
00074 
00075 // Forward declarations.
00076 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00077                        SDValue V2);
00078 
00079 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
00080                                      const X86Subtarget &STI)
00081     : TargetLowering(TM), Subtarget(&STI) {
00082   X86ScalarSSEf64 = Subtarget->hasSSE2();
00083   X86ScalarSSEf32 = Subtarget->hasSSE1();
00084   TD = getDataLayout();
00085 
00086   // Set up the TargetLowering object.
00087   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00088 
00089   // X86 is weird. It always uses i8 for shift amounts and setcc results.
00090   setBooleanContents(ZeroOrOneBooleanContent);
00091   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00092   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
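  // E.g. a scalar SETCC materializes 0 or 1 in an 8-bit register via SETcc,
  // while vector compares such as PCMPEQD or CMPPS produce all-ones/all-zeros
  // lanes that are used directly as masks.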
00093 
00094   // For 64-bit, since we have so many registers, use the ILP scheduler.
00095   // For 32-bit, use the register pressure specific scheduling.
00096   // For Atom, always use ILP scheduling.
00097   if (Subtarget->isAtom())
00098     setSchedulingPreference(Sched::ILP);
00099   else if (Subtarget->is64Bit())
00100     setSchedulingPreference(Sched::ILP);
00101   else
00102     setSchedulingPreference(Sched::RegPressure);
00103   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
00104   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00105 
00106   // Bypass expensive divides on Atom when compiling with O2.
00107   if (TM.getOptLevel() >= CodeGenOpt::Default) {
00108     if (Subtarget->hasSlowDivide32())
00109       addBypassSlowDiv(32, 8);
00110     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
00111       addBypassSlowDiv(64, 16);
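    // Roughly, addBypassSlowDiv(32, 8) has CodeGenPrepare guard the wide divide
    // with a runtime check and use the much cheaper narrow divide whenever both
    // operands happen to fit in the narrow width.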
00112   }
00113 
00114   if (Subtarget->isTargetKnownWindowsMSVC()) {
00115     // Setup Windows compiler runtime calls.
00116     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00117     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00118     setLibcallName(RTLIB::SREM_I64, "_allrem");
00119     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00120     setLibcallName(RTLIB::MUL_I64, "_allmul");
00121     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00122     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00123     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00124     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00125     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00126 
00127     // The _ftol2 runtime function has an unusual calling conv, which
00128     // is modeled by a special pseudo-instruction.
00129     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
00130     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
00131     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
00132     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
00133   }
00134 
00135   if (Subtarget->isTargetDarwin()) {
00136     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00137     setUseUnderscoreSetJmp(false);
00138     setUseUnderscoreLongJmp(false);
00139   } else if (Subtarget->isTargetWindowsGNU()) {
00140     // The MS runtime is odd: it provides _setjmp, but plain longjmp (no underscore).
00141     setUseUnderscoreSetJmp(true);
00142     setUseUnderscoreLongJmp(false);
00143   } else {
00144     setUseUnderscoreSetJmp(true);
00145     setUseUnderscoreLongJmp(true);
00146   }
00147 
00148   // Set up the register classes.
00149   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00150   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00151   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00152   if (Subtarget->is64Bit())
00153     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00154 
00155   for (MVT VT : MVT::integer_valuetypes())
00156     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
00157 
00158   // We don't accept any truncstore of integer registers.
00159   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00160   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00161   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00162   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00163   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00164   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00165 
00166   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00167 
00168   // SETOEQ and SETUNE require checking two conditions.
00169   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00170   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00171   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00172   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00173   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00174   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00175 
00176   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00177   // operation.
00178   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00179   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00180   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00181 
00182   if (Subtarget->is64Bit()) {
00183     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00184     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00185   } else if (!TM.Options.UseSoftFloat) {
00186     // We have an algorithm for SSE2->double, and we turn this into a
00187     // 64-bit FILD followed by conditional FADD for other targets.
00188     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00189     // We have an algorithm for SSE2, and we turn this into a 64-bit
00190     // FILD for other targets.
00191     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00192   }
00193 
00194   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00195   // this operation.
00196   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00197   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00198 
00199   if (!TM.Options.UseSoftFloat) {
00200     // SSE has no i16 to fp conversion, only i32
00201     if (X86ScalarSSEf32) {
00202       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00203       // f32 and f64 cases are Legal, f80 case is not
00204       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00205     } else {
00206       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00207       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00208     }
00209   } else {
00210     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00211     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00212   }
00213 
00214   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00215   // are Legal, f80 is custom lowered.
00216   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00217   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00218 
00219   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
00220   // this operation.
00221   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00222   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00223 
00224   if (X86ScalarSSEf32) {
00225     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00226     // f32 and f64 cases are Legal, f80 case is not
00227     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00228   } else {
00229     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00230     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00231   }
00232 
00233   // Handle FP_TO_UINT by promoting the destination to a larger signed
00234   // conversion.
00235   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00236   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00237   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
00238 
00239   if (Subtarget->is64Bit()) {
00240     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00241     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00242   } else if (!TM.Options.UseSoftFloat) {
00243     // Since AVX is a superset of SSE3, only check for SSE here.
00244     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00245       // Expand FP_TO_UINT into a select.
00246       // FIXME: We would like to use a Custom expander here eventually to do
00247       // the optimal thing for SSE vs. the default expansion in the legalizer.
00248       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00249     else
00250       // With SSE3 we can use fisttpll to convert to a signed i64; without
00251       // SSE, we're stuck with a fistpll.
00252       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00253   }
00254 
00255   if (isTargetFTOL()) {
00256     // Use the _ftol2 runtime function, which has a pseudo-instruction
00257     // to handle its weird calling convention.
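    // (_ftol2 takes its argument on the x87 stack and returns the converted
    // value in EDX:EAX, which is not expressible as an ordinary libcall.)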
00258     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00259   }
00260 
00261   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00262   if (!X86ScalarSSEf64) {
00263     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00264     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00265     if (Subtarget->is64Bit()) {
00266       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00267       // Without SSE, i64->f64 goes through memory.
00268       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00269     }
00270   }
00271 
00272   // Scalar integer divide and remainder are lowered to use operations that
00273   // produce two results, to match the available instructions. This exposes
00274   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00275   // into a single instruction.
00276   //
00277   // Scalar integer multiply-high is also lowered to use two-result
00278   // operations, to match the available instructions. However, plain multiply
00279   // (low) operations are left as Legal, as there are single-result
00280   // instructions for this in x86. Using the two-result multiply instructions
00281   // when both high and low results are needed must be arranged by dagcombine.
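  // For example, "x / y" and "x % y" in the same block both legalize to an
  // ISD::SDIVREM node; CSE merges them, and a single IDIV then yields the
  // quotient in EAX and the remainder in EDX.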
00282   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00283     MVT VT = IntVTs[i];
00284     setOperationAction(ISD::MULHS, VT, Expand);
00285     setOperationAction(ISD::MULHU, VT, Expand);
00286     setOperationAction(ISD::SDIV, VT, Expand);
00287     setOperationAction(ISD::UDIV, VT, Expand);
00288     setOperationAction(ISD::SREM, VT, Expand);
00289     setOperationAction(ISD::UREM, VT, Expand);
00290 
00291     // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
00292     setOperationAction(ISD::ADDC, VT, Custom);
00293     setOperationAction(ISD::ADDE, VT, Custom);
00294     setOperationAction(ISD::SUBC, VT, Custom);
00295     setOperationAction(ISD::SUBE, VT, Custom);
00296   }
00297 
00298   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00299   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00300   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00301   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00302   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00303   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00304   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00305   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00306   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00307   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
00308   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
00309   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
00310   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
00311   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
00312   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
00313   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
00314   if (Subtarget->is64Bit())
00315     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00316   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00317   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00318   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00319   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00320   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00321   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00322   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00323   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00324 
00325   // Promote the i8 variants and force them on up to i32 which has a shorter
00326   // encoding.
00327   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00328   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00329   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00330   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
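  // With BMI, TZCNT is defined for a zero input, so plain CTTZ stays legal and
  // CTTZ_ZERO_UNDEF is simply expanded back into it; without BMI the custom
  // lowering has to work around BSF's undefined result for a zero input.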
00331   if (Subtarget->hasBMI()) {
00332     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00333     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00334     if (Subtarget->is64Bit())
00335       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00336   } else {
00337     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00338     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00339     if (Subtarget->is64Bit())
00340       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00341   }
00342 
00343   if (Subtarget->hasLZCNT()) {
00344     // When promoting the i8 variants, force them to i32 for a shorter
00345     // encoding.
00346     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00347     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00348     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00349     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00350     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00351     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00352     if (Subtarget->is64Bit())
00353       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00354   } else {
00355     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00356     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00357     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00358     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00359     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00360     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00361     if (Subtarget->is64Bit()) {
00362       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00363       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00364     }
00365   }
00366 
00367   // Special handling for half-precision floating point conversions.
00368   // If we don't have F16C support, then lower half float conversions
00369   // into library calls.
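  // (F16C provides VCVTPH2PS/VCVTPS2PH for f16<->f32 conversions; the Expand
  // action below is what produces those runtime library calls when F16C is
  // unavailable.)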
00370   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
00371     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
00372     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
00373   }
00374 
00375   // There's never any support for operations beyond MVT::f32.
00376   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
00377   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
00378   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
00379   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
00380 
00381   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
00382   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
00383   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
00384   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00385   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00386   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
00387 
00388   if (Subtarget->hasPOPCNT()) {
00389     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00390   } else {
00391     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00392     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00393     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00394     if (Subtarget->is64Bit())
00395       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00396   }
00397 
00398   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00399 
00400   if (!Subtarget->hasMOVBE())
00401     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00402 
00403   // These should be promoted to a larger select which is supported.
00404   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00405   // X86 wants to expand cmov itself.
00406   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00407   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00408   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00409   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00410   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00411   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00412   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00413   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00414   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00415   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00416   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00417   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00418   if (Subtarget->is64Bit()) {
00419     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00420     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00421   }
00422   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00423   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
00424   // handling; they are a light-weight setjmp/longjmp replacement used for
00425   // continuations, user-level threading, and the like. As a result, no other
00426   // SjLj exception interfaces are implemented, so please don't build your own
00427   // exception handling on top of them.
00428   // LLVM/Clang supports zero-cost DWARF exception handling.
00429   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00430   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00431 
00432   // Darwin ABI issue.
00433   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00434   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00435   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00436   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00437   if (Subtarget->is64Bit())
00438     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00439   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00440   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00441   if (Subtarget->is64Bit()) {
00442     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00443     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00444     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00445     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00446     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00447   }
00448   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00449   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00450   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00451   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00452   if (Subtarget->is64Bit()) {
00453     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00454     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00455     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00456   }
00457 
00458   if (Subtarget->hasSSE1())
00459     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00460 
00461   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00462 
00463   // Expand certain atomics
00464   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00465     MVT VT = IntVTs[i];
00466     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
00467     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00468     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00469   }
00470 
00471   if (Subtarget->hasCmpxchg16b()) {
00472     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
00473   }
00474 
00475   // FIXME - use subtarget debug flags
00476   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
00477       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
00478     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00479   }
00480 
00481   if (Subtarget->is64Bit()) {
00482     setExceptionPointerRegister(X86::RAX);
00483     setExceptionSelectorRegister(X86::RDX);
00484   } else {
00485     setExceptionPointerRegister(X86::EAX);
00486     setExceptionSelectorRegister(X86::EDX);
00487   }
00488   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00489   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00490 
00491   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00492   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00493 
00494   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00495   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00496 
00497   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00498   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00499   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00500   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00501     // TargetInfo::X86_64ABIBuiltinVaList
00502     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00503     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00504   } else {
00505     // TargetInfo::CharPtrBuiltinVaList
00506     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00507     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00508   }
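  // On x86-64 SysV targets va_list is a structure (gp_offset, fp_offset,
  // overflow_arg_area, reg_save_area), so VAARG/VACOPY need custom lowering;
  // on 32-bit and Win64 it is just a char pointer, so the default expansion
  // suffices.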
00509 
00510   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00511   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00512 
00513   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
00514 
00515   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00516     // f32 and f64 use SSE.
00517     // Set up the FP register classes.
00518     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00519     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00520 
00521     // Use ANDPD to simulate FABS.
00522     setOperationAction(ISD::FABS , MVT::f64, Custom);
00523     setOperationAction(ISD::FABS , MVT::f32, Custom);
00524 
00525     // Use XORP to simulate FNEG.
00526     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00527     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00528 
00529     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00530     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00531     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
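    // E.g. FABS clears the sign bit by ANDing with a constant whose sign bit is
    // zero, FNEG flips it by XORing with a sign-bit-only constant, and
    // FCOPYSIGN combines the magnitude bits of one operand with the sign bit of
    // the other using AND and OR.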
00532 
00533     // Lower this to FGETSIGNx86 plus an AND.
00534     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00535     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00536 
00537     // We don't support sin/cos/fmod
00538     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00539     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00540     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00541     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00542     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00543     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00544 
00545     // Expand FP immediates into loads from the stack, except for the special
00546     // cases we handle.
00547     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00548     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00549   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00550     // Use SSE for f32, x87 for f64.
00551     // Set up the FP register classes.
00552     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00553     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00554 
00555     // Use ANDPS to simulate FABS.
00556     setOperationAction(ISD::FABS , MVT::f32, Custom);
00557 
00558     // Use XORP to simulate FNEG.
00559     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00560 
00561     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00562 
00563     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00564     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00565     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00566 
00567     // We don't support sin/cos/fmod
00568     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00569     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00570     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00571 
00572     // Special cases we handle for FP constants.
00573     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00574     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00575     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00576     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00577     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00578 
00579     if (!TM.Options.UnsafeFPMath) {
00580       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00581       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00582       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00583     }
00584   } else if (!TM.Options.UseSoftFloat) {
00585     // f32 and f64 in x87.
00586     // Set up the FP register classes.
00587     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00588     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00589 
00590     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00591     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00592     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00593     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00594 
00595     if (!TM.Options.UnsafeFPMath) {
00596       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00597       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00598       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00599       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00600       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00601       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00602     }
00603     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00604     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00605     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00606     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00607     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00608     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00609     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00610     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00611   }
00612 
00613   // We don't support FMA.
00614   setOperationAction(ISD::FMA, MVT::f64, Expand);
00615   setOperationAction(ISD::FMA, MVT::f32, Expand);
00616 
00617   // Long double always uses X87.
00618   if (!TM.Options.UseSoftFloat) {
00619     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00620     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00621     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00622     {
00623       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00624       addLegalFPImmediate(TmpFlt);  // FLD0
00625       TmpFlt.changeSign();
00626       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00627 
00628       bool ignored;
00629       APFloat TmpFlt2(+1.0);
00630       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00631                       &ignored);
00632       addLegalFPImmediate(TmpFlt2);  // FLD1
00633       TmpFlt2.changeSign();
00634       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00635     }
00636 
00637     if (!TM.Options.UnsafeFPMath) {
00638       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00639       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00640       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00641     }
00642 
00643     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00644     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00645     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00646     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00647     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00648     setOperationAction(ISD::FMA, MVT::f80, Expand);
00649   }
00650 
00651   // Always use a library call for pow.
00652   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00653   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00654   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00655 
00656   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00657   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00658   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00659   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00660   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00661   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
00662   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
00663 
00664   // First set operation action for all vector types to either promote
00665   // (for widening) or expand (for scalarization). Then we will selectively
00666   // turn on ones that can be effectively codegen'd.
00667   for (MVT VT : MVT::vector_valuetypes()) {
00668     setOperationAction(ISD::ADD , VT, Expand);
00669     setOperationAction(ISD::SUB , VT, Expand);
00670     setOperationAction(ISD::FADD, VT, Expand);
00671     setOperationAction(ISD::FNEG, VT, Expand);
00672     setOperationAction(ISD::FSUB, VT, Expand);
00673     setOperationAction(ISD::MUL , VT, Expand);
00674     setOperationAction(ISD::FMUL, VT, Expand);
00675     setOperationAction(ISD::SDIV, VT, Expand);
00676     setOperationAction(ISD::UDIV, VT, Expand);
00677     setOperationAction(ISD::FDIV, VT, Expand);
00678     setOperationAction(ISD::SREM, VT, Expand);
00679     setOperationAction(ISD::UREM, VT, Expand);
00680     setOperationAction(ISD::LOAD, VT, Expand);
00681     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00682     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00683     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00684     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00685     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00686     setOperationAction(ISD::FABS, VT, Expand);
00687     setOperationAction(ISD::FSIN, VT, Expand);
00688     setOperationAction(ISD::FSINCOS, VT, Expand);
00689     setOperationAction(ISD::FCOS, VT, Expand);
00690     setOperationAction(ISD::FSINCOS, VT, Expand);
00691     setOperationAction(ISD::FREM, VT, Expand);
00692     setOperationAction(ISD::FMA,  VT, Expand);
00693     setOperationAction(ISD::FPOWI, VT, Expand);
00694     setOperationAction(ISD::FSQRT, VT, Expand);
00695     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00696     setOperationAction(ISD::FFLOOR, VT, Expand);
00697     setOperationAction(ISD::FCEIL, VT, Expand);
00698     setOperationAction(ISD::FTRUNC, VT, Expand);
00699     setOperationAction(ISD::FRINT, VT, Expand);
00700     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00701     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00702     setOperationAction(ISD::MULHS, VT, Expand);
00703     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00704     setOperationAction(ISD::MULHU, VT, Expand);
00705     setOperationAction(ISD::SDIVREM, VT, Expand);
00706     setOperationAction(ISD::UDIVREM, VT, Expand);
00707     setOperationAction(ISD::FPOW, VT, Expand);
00708     setOperationAction(ISD::CTPOP, VT, Expand);
00709     setOperationAction(ISD::CTTZ, VT, Expand);
00710     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00711     setOperationAction(ISD::CTLZ, VT, Expand);
00712     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00713     setOperationAction(ISD::SHL, VT, Expand);
00714     setOperationAction(ISD::SRA, VT, Expand);
00715     setOperationAction(ISD::SRL, VT, Expand);
00716     setOperationAction(ISD::ROTL, VT, Expand);
00717     setOperationAction(ISD::ROTR, VT, Expand);
00718     setOperationAction(ISD::BSWAP, VT, Expand);
00719     setOperationAction(ISD::SETCC, VT, Expand);
00720     setOperationAction(ISD::FLOG, VT, Expand);
00721     setOperationAction(ISD::FLOG2, VT, Expand);
00722     setOperationAction(ISD::FLOG10, VT, Expand);
00723     setOperationAction(ISD::FEXP, VT, Expand);
00724     setOperationAction(ISD::FEXP2, VT, Expand);
00725     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00726     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00727     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00728     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00729     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00730     setOperationAction(ISD::TRUNCATE, VT, Expand);
00731     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00732     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00733     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00734     setOperationAction(ISD::VSELECT, VT, Expand);
00735     setOperationAction(ISD::SELECT_CC, VT, Expand);
00736     for (MVT InnerVT : MVT::vector_valuetypes()) {
00737       setTruncStoreAction(InnerVT, VT, Expand);
00738 
00739       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
00740       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
00741 
00742       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
00743       // types, we have to deal with them whether we ask for Expansion or not.
00744       // Setting Expand causes its own optimisation problems though, so leave
00745       // them legal.
00746       if (VT.getVectorElementType() == MVT::i1)
00747         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
00748     }
00749   }
00750 
00751   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00752   // with -msoft-float, disable use of MMX as well.
00753   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00754     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00755     // No operations on x86mmx are supported; everything uses intrinsics.
00756   }
00757 
00758   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00759   // into smaller operations.
00760   for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
00761     setOperationAction(ISD::MULHS,              MMXTy,      Expand);
00762     setOperationAction(ISD::AND,                MMXTy,      Expand);
00763     setOperationAction(ISD::OR,                 MMXTy,      Expand);
00764     setOperationAction(ISD::XOR,                MMXTy,      Expand);
00765     setOperationAction(ISD::SCALAR_TO_VECTOR,   MMXTy,      Expand);
00766     setOperationAction(ISD::SELECT,             MMXTy,      Expand);
00767     setOperationAction(ISD::BITCAST,            MMXTy,      Expand);
00768   }
00769   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00770 
00771   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00772     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00773 
00774     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00775     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00776     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00777     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00778     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00779     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00780     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00781     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00782     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00783     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00784     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
00785     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00786     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00787     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
00788   }
00789 
00790   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00791     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00792 
00793     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
00794     // registers cannot be used even for integer operations.
00795     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00796     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00797     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00798     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00799 
00800     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00801     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00802     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00803     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00804     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00805     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00806     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
00807     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
00808     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
00809     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
00810     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00811     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00812     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00813     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00814     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00815     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00816     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00817     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00818     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00819     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00820     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00821     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00822 
00823     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00824     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00825     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00826     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00827 
00828     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00829     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00830     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00831     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00832     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00833 
00834     // Only provide customized ctpop vector bit twiddling for vector types we
00835     // know to perform better than using the popcnt instructions on each vector
00836     // element. If popcnt isn't supported, always provide the custom version.
00837     if (!Subtarget->hasPOPCNT()) {
00838       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
00839       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
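      // The custom lowering counts bits with in-vector shift/mask/add steps
      // rather than extracting each element for a scalar POPCNT.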
00840     }
00841 
00842     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
00843     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00844       MVT VT = (MVT::SimpleValueType)i;
00845       // Do not attempt to custom lower non-power-of-2 vectors
00846       if (!isPowerOf2_32(VT.getVectorNumElements()))
00847         continue;
00848       // Do not attempt to custom lower non-128-bit vectors
00849       if (!VT.is128BitVector())
00850         continue;
00851       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
00852       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
00853       setOperationAction(ISD::VSELECT,            VT, Custom);
00854       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
00855     }
00856 
00857     // We support custom legalizing of sext and anyext loads for specific
00858     // memory vector types which we can load as a scalar (or sequence of
00859     // scalars) and extend in-register to a legal 128-bit vector type. For sext
00860     // loads these must work with a single scalar load.
00861     for (MVT VT : MVT::integer_vector_valuetypes()) {
00862       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
00863       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
00864       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
00865       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
00866       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
00867       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
00868       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
00869       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
00870       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
00871     }
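    // E.g. a sextload from v4i8 to v4i32 can be a single 32-bit scalar load
    // followed by an in-register sign extension (with SSE4.1 this is a single
    // PMOVSXBD; see the SSE41 block below).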
00872 
00873     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
00874     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
00875     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
00876     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
00877     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
00878     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
00879     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
00880     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
00881 
00882     if (Subtarget->is64Bit()) {
00883       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
00884       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
00885     }
00886 
00887     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
00888     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00889       MVT VT = (MVT::SimpleValueType)i;
00890 
00891       // Do not attempt to promote non-128-bit vectors
00892       if (!VT.is128BitVector())
00893         continue;
00894 
00895       setOperationAction(ISD::AND,    VT, Promote);
00896       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
00897       setOperationAction(ISD::OR,     VT, Promote);
00898       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
00899       setOperationAction(ISD::XOR,    VT, Promote);
00900       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
00901       setOperationAction(ISD::LOAD,   VT, Promote);
00902       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
00903       setOperationAction(ISD::SELECT, VT, Promote);
00904       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
00905     }
00906 
00907     // Custom lower v2i64 and v2f64 selects.
00908     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
00909     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
00910     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
00911     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
00912 
00913     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
00914     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
00915 
00916     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
00917     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
00918     // As there is no 64-bit GPR available, we need to build a special custom
00919     // sequence to convert from v2i32 to v2f32.
00920     if (!Subtarget->is64Bit())
00921       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
00922 
00923     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
00924     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
00925 
00926     for (MVT VT : MVT::fp_vector_valuetypes())
00927       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
00928 
00929     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
00930     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
00931     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
00932   }
00933 
00934   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
00935     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
00936       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
00937       setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
00938       setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
00939       setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
00940       setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
00941     }
00942 
00943     // FIXME: Do we need to handle scalar-to-vector here?
00944     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
00945 
00946     // We directly match byte blends in the backend as they match the VSELECT
00947     // condition form.
00948     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
00949 
00950     // SSE41 brings specific instructions for doing vector sign extend even in
00951     // cases where we don't have SRA.
00952     for (MVT VT : MVT::integer_vector_valuetypes()) {
00953       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
00954       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
00955       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
00956     }
00957 
00958     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
00959     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
00960     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
00961     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
00962     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
00963     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
00964     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
00965 
00966     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
00967     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
00968     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
00969     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
00970     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
00971     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
00972 
00973     // i8 and i16 vectors are custom because the source register and source
00974     // memory operand types are not the same width.  f32 vectors are
00975     // custom since the immediate controlling the insert encodes additional
00976     // information.
00977     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
00978     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00979     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00980     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00981 
00982     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
00983     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
00984     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
00985     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00986 
00987     // FIXME: these should be Legal, but that's only for the case where
00988     // the index is constant.  For now custom expand to deal with that.
00989     if (Subtarget->is64Bit()) {
00990       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
00991       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
00992     }
00993   }
00994 
00995   if (Subtarget->hasSSE2()) {
00996     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
00997     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
00998 
00999     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01000     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01001 
01002     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01003     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01004 
01005     // In the custom shift lowering, cases that are already legal with AVX2 are
01006     // recognized and left as-is.
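    // With AVX2, VPSLLVD/Q, VPSRLVD/Q and VPSRAVD provide per-element variable
    // shifts for 32/64-bit elements (there is no 64-bit arithmetic variant).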
01007     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01008     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01009 
01010     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01011     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01012 
01013     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01014   }
01015 
01016   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01017     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01018     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01019     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01020     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01021     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01022     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01023 
01024     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01025     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01026     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01027 
01028     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01029     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01030     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01031     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01032     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01033     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01034     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01035     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01036     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01037     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01038     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01039     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01040 
01041     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01042     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01043     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01044     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01045     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01046     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01047     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01048     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01049     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01050     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01051     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01052     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01053 
01054     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01055     // even though v8i16 is a legal type.
01056     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01057     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01058     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01059 
01060     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01061     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01062     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01063 
01064     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01065     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01066 
01067     for (MVT VT : MVT::fp_vector_valuetypes())
01068       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
01069 
01070     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01071     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01072 
01073     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01074     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01075 
01076     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01077     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01078 
01079     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01080     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01081     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01082     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01083 
01084     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01085     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01086     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01087 
01088     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01089     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01090     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01091     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01092     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01093     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01094     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01095     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01096     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01097     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01098     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01099     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01100 
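    // With either FMA3 or FMA4 available, the generic ISD::FMA node can map
    // straight to a fused multiply-add instruction, so it is Legal for the
    // vector and scalar FP types listed below.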
01101     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01102       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01103       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01104       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01105       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01106       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01107       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01108     }
01109 
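    // 256-bit integer arithmetic is only natively supported with AVX2
    // (hasInt256); without it these operations are Custom-lowered, typically
    // by splitting the 256-bit vector into two 128-bit halves.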
01110     if (Subtarget->hasInt256()) {
01111       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01112       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01113       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01114       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01115 
01116       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01117       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01118       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01119       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01120 
01121       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01122       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01123       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01124       // Don't lower v32i8 because there is no 128-bit byte mul
01125 
01126       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
01127       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
01128       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
01129       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
01130 
01131       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
01132       // when we have a 256-bit wide blend with immediate.
01133       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
01134 
01135       // Only provide customized ctpop vector bit twiddling for vector types we
01136       // know will perform better than using the popcnt instructions on each
01137       // vector element. If popcnt isn't supported, always provide the custom
01138       // version.
01139       if (!Subtarget->hasPOPCNT())
01140         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
01141 
01142       // Custom CTPOP always performs better on natively supported v8i32
01143       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
01144 
01145       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
01146       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
01147       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
01148       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
01149       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
01150       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
01151       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
01152 
01153       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
01154       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
01155       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
01156       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
01157       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
01158       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
01159     } else {
01160       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01161       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01162       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01163       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01164 
01165       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01166       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01167       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01168       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01169 
01170       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01171       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01172       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01173       // Don't lower v32i8 because there is no 128-bit byte mul
01174     }
01175 
01176     // In the customized shift lowering, the legal cases in AVX2 will be
01177     // recognized.
01178     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01179     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01180 
01181     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01182     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01183 
01184     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01185 
01186     // Custom lower several nodes for 256-bit types.
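    // Note that masked loads/stores (MLOAD/MSTORE) are only marked Legal for
    // element types of at least 32 bits, matching the dword/qword granularity
    // of the VMASKMOV-family instructions.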
01187     for (MVT VT : MVT::vector_valuetypes()) {
01188       if (VT.getScalarSizeInBits() >= 32) {
01189         setOperationAction(ISD::MLOAD,  VT, Legal);
01190         setOperationAction(ISD::MSTORE, VT, Legal);
01191       }
01192       // Extract subvector is special because the value type
01193       // (result) is 128-bit but the source is 256-bit wide.
01194       if (VT.is128BitVector()) {
01195         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01196       }
01197       // Do not attempt to custom lower other non-256-bit vectors
01198       if (!VT.is256BitVector())
01199         continue;
01200 
01201       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01202       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01203       setOperationAction(ISD::VSELECT,            VT, Custom);
01204       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01205       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01206       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01207       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01208       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01209     }
01210 
01211     if (Subtarget->hasInt256())
01212       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01213 
01214 
01215     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01216     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01217       MVT VT = (MVT::SimpleValueType)i;
01218 
01219       // Do not attempt to promote non-256-bit vectors
01220       if (!VT.is256BitVector())
01221         continue;
01222 
01223       setOperationAction(ISD::AND,    VT, Promote);
01224       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01225       setOperationAction(ISD::OR,     VT, Promote);
01226       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01227       setOperationAction(ISD::XOR,    VT, Promote);
01228       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01229       setOperationAction(ISD::LOAD,   VT, Promote);
01230       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01231       setOperationAction(ISD::SELECT, VT, Promote);
01232       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01233     }
01234   }
01235 
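  // AVX-512: 512-bit vectors live in the ZMM (VR512) register class, and the
  // k0-k7 mask registers are modeled as scalar i1 plus the v8i1/v16i1 vector
  // types registered below.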
01236   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01237     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01238     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01239     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01240     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01241 
01242     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01243     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01244     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01245 
01246     for (MVT VT : MVT::fp_vector_valuetypes())
01247       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
01248 
01249     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01250     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01251     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01252     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01253     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01254     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01255     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01256     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01257     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01258     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01259 
01260     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01261     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01262     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01263     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01264     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01265     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01266 
01267     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01268     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01269     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01270     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01271     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01272     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01273     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01274     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01275 
01276     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01277     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01278     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01279     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01280     if (Subtarget->is64Bit()) {
01281       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01282       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01283       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01284       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01285     }
01286     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01287     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01288     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01289     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01290     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01291     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
01292     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
01293     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
01294     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
01295     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01296     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01297     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01298     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01299     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01300 
01301     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01302     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01303     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01304     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01305     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01306     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01307     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01308     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01309     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01310     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01311     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01312     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01313     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01314 
01315     setOperationAction(ISD::FFLOOR,             MVT::v16f32, Legal);
01316     setOperationAction(ISD::FFLOOR,             MVT::v8f64, Legal);
01317     setOperationAction(ISD::FCEIL,              MVT::v16f32, Legal);
01318     setOperationAction(ISD::FCEIL,              MVT::v8f64, Legal);
01319     setOperationAction(ISD::FTRUNC,             MVT::v16f32, Legal);
01320     setOperationAction(ISD::FTRUNC,             MVT::v8f64, Legal);
01321     setOperationAction(ISD::FRINT,              MVT::v16f32, Legal);
01322     setOperationAction(ISD::FRINT,              MVT::v8f64, Legal);
01323     setOperationAction(ISD::FNEARBYINT,         MVT::v16f32, Legal);
01324     setOperationAction(ISD::FNEARBYINT,         MVT::v8f64, Legal);
01325 
01326     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01327     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01328     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01329     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01330     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01331 
01332     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01333     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01334 
01335     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01336 
01337     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01338     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01339     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01340     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01341     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01342     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01343     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01344     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01345     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01346 
01347     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01348     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01349 
01350     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01351     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01352 
01353     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01354 
01355     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01356     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01357 
01358     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01359     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01360 
01361     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01362     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01363 
01364     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01365     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01366     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01367     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01368     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01369     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01370 
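    // AVX-512CD provides vplzcntd/vplzcntq, so vector count-leading-zeros is
    // natively supported.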
01371     if (Subtarget->hasCDI()) {
01372       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
01373       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
01374     }
01375 
01376     // Custom lower several nodes.
01377     for (MVT VT : MVT::vector_valuetypes()) {
01378       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01379       // Extract subvector is special because the value type
01380       // (result) is 256/128-bit but the source is 512-bit wide.
01381       if (VT.is128BitVector() || VT.is256BitVector()) {
01382         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01383       }
01384       if (VT.getVectorElementType() == MVT::i1)
01385         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01386 
01387       // Do not attempt to custom lower other non-512-bit vectors
01388       if (!VT.is512BitVector())
01389         continue;
01390 
01391       if (EltSize >= 32) {
01392         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01393         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01394         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01395         setOperationAction(ISD::VSELECT,             VT, Legal);
01396         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01397         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01398         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01399         setOperationAction(ISD::MLOAD,               VT, Legal);
01400         setOperationAction(ISD::MSTORE,              VT, Legal);
01401       }
01402     }
01403     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01404       MVT VT = (MVT::SimpleValueType)i;
01405 
01406       // Do not attempt to promote non-512-bit vectors.
01407       if (!VT.is512BitVector())
01408         continue;
01409 
01410       setOperationAction(ISD::SELECT, VT, Promote);
01411       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01412     }
01413   } // has AVX-512
01414 
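  // AVX-512BW adds native byte/word element support for 512-bit vectors,
  // along with the v32i1/v64i1 mask register types.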
01415   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
01416     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
01417     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
01418 
01419     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
01420     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
01421 
01422     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
01423     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
01424     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
01425     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
01426     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
01427     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
01428     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
01429     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
01430     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
01431     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
01432     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
01433     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
01434     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
01435 
01436     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01437       const MVT VT = (MVT::SimpleValueType)i;
01438 
01439       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01440 
01441       // Do not attempt to promote non-512-bit vectors.
01442       if (!VT.is512BitVector())
01443         continue;
01444 
01445       if (EltSize < 32) {
01446         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01447         setOperationAction(ISD::VSELECT,             VT, Legal);
01448       }
01449     }
01450   }
01451 
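  // AVX-512VL extends the mask-register model down to 128/256-bit vectors,
  // which is where the narrow v2i1/v4i1 mask types come from.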
01452   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
01453     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
01454     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
01455 
01456     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
01457     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
01458     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
01459     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
01460     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
01461     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
01462 
01463     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
01464     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
01465     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
01466     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
01467     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
01468     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
01469   }
01470 
01471   // We want to custom lower some of our intrinsics.
01472   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01473   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01474   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01475   if (!Subtarget->is64Bit())
01476     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01477 
01478   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01479   // handle type legalization for these operations here.
01480   //
01481   // FIXME: We really should do custom legalization for addition and
01482   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01483   // than generic legalization for 64-bit multiplication-with-overflow, though.
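  // For example, @llvm.sadd.with.overflow.i32 is lowered to an X86 add that
  // also produces EFLAGS, with the overflow bit recovered via a SETO-style
  // setcc on that flags result.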
01484   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01485     // Add/Sub/Mul with overflow operations are custom lowered.
01486     MVT VT = IntVTs[i];
01487     setOperationAction(ISD::SADDO, VT, Custom);
01488     setOperationAction(ISD::UADDO, VT, Custom);
01489     setOperationAction(ISD::SSUBO, VT, Custom);
01490     setOperationAction(ISD::USUBO, VT, Custom);
01491     setOperationAction(ISD::SMULO, VT, Custom);
01492     setOperationAction(ISD::UMULO, VT, Custom);
01493   }
01494 
01495 
01496   if (!Subtarget->is64Bit()) {
01497     // These libcalls are not available in 32-bit mode.
01498     setLibcallName(RTLIB::SHL_I128, nullptr);
01499     setLibcallName(RTLIB::SRL_I128, nullptr);
01500     setLibcallName(RTLIB::SRA_I128, nullptr);
01501   }
01502 
01503   // Combine sin / cos into one node or libcall if possible.
01504   if (Subtarget->hasSinCos()) {
01505     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01506     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01507     if (Subtarget->isTargetDarwin()) {
01508       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
01509       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
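      // For example, a sinf(x)/cosf(x) pair on the same operand can then be
      // folded into a single __sincos_stret call.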
01510       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01511       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01512     }
01513   }
01514 
01515   if (Subtarget->isTargetWin64()) {
01516     setOperationAction(ISD::SDIV, MVT::i128, Custom);
01517     setOperationAction(ISD::UDIV, MVT::i128, Custom);
01518     setOperationAction(ISD::SREM, MVT::i128, Custom);
01519     setOperationAction(ISD::UREM, MVT::i128, Custom);
01520     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
01521     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
01522   }
01523 
01524   // We have target-specific dag combine patterns for the following nodes:
01525   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01526   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01527   setTargetDAGCombine(ISD::BITCAST);
01528   setTargetDAGCombine(ISD::VSELECT);
01529   setTargetDAGCombine(ISD::SELECT);
01530   setTargetDAGCombine(ISD::SHL);
01531   setTargetDAGCombine(ISD::SRA);
01532   setTargetDAGCombine(ISD::SRL);
01533   setTargetDAGCombine(ISD::OR);
01534   setTargetDAGCombine(ISD::AND);
01535   setTargetDAGCombine(ISD::ADD);
01536   setTargetDAGCombine(ISD::FADD);
01537   setTargetDAGCombine(ISD::FSUB);
01538   setTargetDAGCombine(ISD::FMA);
01539   setTargetDAGCombine(ISD::SUB);
01540   setTargetDAGCombine(ISD::LOAD);
01541   setTargetDAGCombine(ISD::MLOAD);
01542   setTargetDAGCombine(ISD::STORE);
01543   setTargetDAGCombine(ISD::MSTORE);
01544   setTargetDAGCombine(ISD::ZERO_EXTEND);
01545   setTargetDAGCombine(ISD::ANY_EXTEND);
01546   setTargetDAGCombine(ISD::SIGN_EXTEND);
01547   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01548   setTargetDAGCombine(ISD::TRUNCATE);
01549   setTargetDAGCombine(ISD::SINT_TO_FP);
01550   setTargetDAGCombine(ISD::SETCC);
01551   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
01552   setTargetDAGCombine(ISD::BUILD_VECTOR);
01553   setTargetDAGCombine(ISD::MUL);
01554   setTargetDAGCombine(ISD::XOR);
01555 
01556   computeRegisterProperties(Subtarget->getRegisterInfo());
01557 
01558   // On Darwin, -Os means optimize for size without hurting performance,
01559   // so do not reduce the limit.
01560   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01561   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01562   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01563   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01564   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01565   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01566   setPrefLoopAlignment(4); // 2^4 bytes.
01567 
01568   // Predictable cmovs don't hurt on Atom because it's in-order.
01569   PredictableSelectIsExpensive = !Subtarget->isAtom();
01570   EnableExtLdPromotion = true;
01571   setPrefFunctionAlignment(4); // 2^4 bytes.
01572 
01573   verifyIntrinsicTables();
01574 }
01575 
01576 // This has so far only been implemented for 64-bit MachO.
01577 bool X86TargetLowering::useLoadStackGuardNode() const {
01578   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
01579 }
01580 
01581 TargetLoweringBase::LegalizeTypeAction
01582 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
01583   if (ExperimentalVectorWideningLegalization &&
01584       VT.getVectorNumElements() != 1 &&
01585       VT.getVectorElementType().getSimpleVT() != MVT::i1)
01586     return TypeWidenVector;
01587 
01588   return TargetLoweringBase::getPreferredVectorAction(VT);
01589 }
01590 
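// For vector compares, AVX-512 subtargets return the corresponding k-register
// mask type (v8i1, v16i1, ...) when one exists for the element width;
// otherwise the result is a vector of the same width with integer elements.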
01591 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01592   if (!VT.isVector())
01593     return Subtarget->hasAVX512() ? MVT::i1 : MVT::i8;
01594 
01595   const unsigned NumElts = VT.getVectorNumElements();
01596   const EVT EltVT = VT.getVectorElementType();
01597   if (VT.is512BitVector()) {
01598     if (Subtarget->hasAVX512())
01599       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01600           EltVT == MVT::f32 || EltVT == MVT::f64)
01601         switch (NumElts) {
01602         case  8: return MVT::v8i1;
01603         case 16: return MVT::v16i1;
01604         }
01605     if (Subtarget->hasBWI())
01606       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01607         switch (NumElts) {
01608         case 32: return MVT::v32i1;
01609         case 64: return MVT::v64i1;
01610         }
01611   }
01612 
01613   if (VT.is256BitVector() || VT.is128BitVector()) {
01614     if (Subtarget->hasVLX())
01615       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01616           EltVT == MVT::f32 || EltVT == MVT::f64)
01617         switch (NumElts) {
01618         case 2: return MVT::v2i1;
01619         case 4: return MVT::v4i1;
01620         case 8: return MVT::v8i1;
01621         }
01622     if (Subtarget->hasBWI() && Subtarget->hasVLX())
01623       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01624         switch (NumElts) {
01625         case  8: return MVT::v8i1;
01626         case 16: return MVT::v16i1;
01627         case 32: return MVT::v32i1;
01628         }
01629   }
01630 
01631   return VT.changeVectorElementTypeToInteger();
01632 }
01633 
01634 /// Helper for getByValTypeAlignment to determine
01635 /// the desired ByVal argument alignment.
01636 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01637   if (MaxAlign == 16)
01638     return;
01639   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01640     if (VTy->getBitWidth() == 128)
01641       MaxAlign = 16;
01642   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01643     unsigned EltAlign = 0;
01644     getMaxByValAlign(ATy->getElementType(), EltAlign);
01645     if (EltAlign > MaxAlign)
01646       MaxAlign = EltAlign;
01647   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01648     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01649       unsigned EltAlign = 0;
01650       getMaxByValAlign(STy->getElementType(i), EltAlign);
01651       if (EltAlign > MaxAlign)
01652         MaxAlign = EltAlign;
01653       if (MaxAlign == 16)
01654         break;
01655     }
01656   }
01657 }
01658 
01659 /// Return the desired alignment for ByVal aggregate
01660 /// function arguments in the caller parameter area. For X86, aggregates
01661 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01662 /// are at 4-byte boundaries.
01663 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01664   if (Subtarget->is64Bit()) {
01665     // Max of 8 and alignment of type.
01666     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01667     if (TyAlign > 8)
01668       return TyAlign;
01669     return 8;
01670   }
01671 
01672   unsigned Align = 4;
01673   if (Subtarget->hasSSE1())
01674     getMaxByValAlign(Ty, Align);
01675   return Align;
01676 }
01677 
01678 /// Returns the target specific optimal type for load
01679 /// and store operations as a result of memset, memcpy, and memmove
01680 /// lowering. If DstAlign is zero, it is safe to assume the destination
01681 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero,
01682 /// there is no need to check it against an alignment requirement,
01683 /// probably because the source does not need to be loaded. If 'IsMemset' is
01684 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
01685 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
01686 /// source is constant so it does not need to be loaded.
01687 /// It returns EVT::Other if the type should be determined using generic
01688 /// target-independent logic.
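/// For example, a 32-byte copy on an AVX2 target with fast unaligned accesses
/// is lowered with v8i32, while a small copy on 32-bit x86 with SSE2 may use
/// f64 loads/stores to halve the number of memory operations.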
01689 EVT
01690 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01691                                        unsigned DstAlign, unsigned SrcAlign,
01692                                        bool IsMemset, bool ZeroMemset,
01693                                        bool MemcpyStrSrc,
01694                                        MachineFunction &MF) const {
01695   const Function *F = MF.getFunction();
01696   if ((!IsMemset || ZeroMemset) &&
01697       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
01698     if (Size >= 16 &&
01699         (Subtarget->isUnalignedMemAccessFast() ||
01700          ((DstAlign == 0 || DstAlign >= 16) &&
01701           (SrcAlign == 0 || SrcAlign >= 16)))) {
01702       if (Size >= 32) {
01703         if (Subtarget->hasInt256())
01704           return MVT::v8i32;
01705         if (Subtarget->hasFp256())
01706           return MVT::v8f32;
01707       }
01708       if (Subtarget->hasSSE2())
01709         return MVT::v4i32;
01710       if (Subtarget->hasSSE1())
01711         return MVT::v4f32;
01712     } else if (!MemcpyStrSrc && Size >= 8 &&
01713                !Subtarget->is64Bit() &&
01714                Subtarget->hasSSE2()) {
01715       // Do not use f64 to lower memcpy if the source is a string constant. It's
01716       // better to use i32 to avoid the loads.
01717       return MVT::f64;
01718     }
01719   }
01720   if (Subtarget->is64Bit() && Size >= 8)
01721     return MVT::i64;
01722   return MVT::i32;
01723 }
01724 
01725 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01726   if (VT == MVT::f32)
01727     return X86ScalarSSEf32;
01728   else if (VT == MVT::f64)
01729     return X86ScalarSSEf64;
01730   return true;
01731 }
01732 
01733 bool
01734 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
01735                                                   unsigned,
01736                                                   unsigned,
01737                                                   bool *Fast) const {
01738   if (Fast)
01739     *Fast = Subtarget->isUnalignedMemAccessFast();
01740   return true;
01741 }
01742 
01743 /// Return the entry encoding for a jump table in the
01744 /// current function.  The returned value is a member of the
01745 /// MachineJumpTableInfo::JTEntryKind enum.
01746 unsigned X86TargetLowering::getJumpTableEncoding() const {
01747   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01748   // symbol.
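  // (Each 32-bit entry then looks roughly like ".long .LBB0_n@GOTOFF", i.e. it
  // is resolved relative to the GOT base rather than as an absolute address.)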
01749   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01750       Subtarget->isPICStyleGOT())
01751     return MachineJumpTableInfo::EK_Custom32;
01752 
01753   // Otherwise, use the normal jump table encoding heuristics.
01754   return TargetLowering::getJumpTableEncoding();
01755 }
01756 
01757 const MCExpr *
01758 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01759                                              const MachineBasicBlock *MBB,
01760                                              unsigned uid, MCContext &Ctx) const {
01761   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
01762          Subtarget->isPICStyleGOT());
01763   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01764   // entries.
01765   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01766                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01767 }
01768 
01769 /// Returns relocation base for the given PIC jumptable.
01770 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01771                                                     SelectionDAG &DAG) const {
01772   if (!Subtarget->is64Bit())
01773     // This doesn't have SDLoc associated with it, but is not really the
01774     // same as a Register.
01775     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01776   return Table;
01777 }
01778 
01779 /// This returns the relocation base for the given PIC jumptable,
01780 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
01781 const MCExpr *X86TargetLowering::
01782 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01783                              MCContext &Ctx) const {
01784   // X86-64 uses RIP relative addressing based on the jump table label.
01785   if (Subtarget->isPICStyleRIPRel())
01786     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01787 
01788   // Otherwise, the reference is relative to the PIC base.
01789   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01790 }
01791 
01792 std::pair<const TargetRegisterClass *, uint8_t>
01793 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
01794                                            MVT VT) const {
01795   const TargetRegisterClass *RRC = nullptr;
01796   uint8_t Cost = 1;
01797   switch (VT.SimpleTy) {
01798   default:
01799     return TargetLowering::findRepresentativeClass(TRI, VT);
01800   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01801     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
01802     break;
01803   case MVT::x86mmx:
01804     RRC = &X86::VR64RegClass;
01805     break;
01806   case MVT::f32: case MVT::f64:
01807   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01808   case MVT::v4f32: case MVT::v2f64:
01809   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01810   case MVT::v4f64:
01811     RRC = &X86::VR128RegClass;
01812     break;
01813   }
01814   return std::make_pair(RRC, Cost);
01815 }
01816 
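// Address space 256 selects a %gs-relative access and 257 an %fs-relative one,
// so the pairs below encode %fs:0x28 (or %gs:0x28 for the kernel code model)
// on x86-64 and %gs:0x14 on i386.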
01817 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01818                                                unsigned &Offset) const {
01819   if (!Subtarget->isTargetLinux())
01820     return false;
01821 
01822   if (Subtarget->is64Bit()) {
01823     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28.
01824     Offset = 0x28;
01825     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01826       AddressSpace = 256;
01827     else
01828       AddressSpace = 257;
01829   } else {
01830     // %gs:0x14 on i386
01831     Offset = 0x14;
01832     AddressSpace = 256;
01833   }
01834   return true;
01835 }
01836 
01837 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01838                                             unsigned DestAS) const {
01839   assert(SrcAS != DestAS && "Expected different address spaces!");
01840 
01841   return SrcAS < 256 && DestAS < 256;
01842 }
01843 
01844 //===----------------------------------------------------------------------===//
01845 //               Return Value Calling Convention Implementation
01846 //===----------------------------------------------------------------------===//
01847 
01848 #include "X86GenCallingConv.inc"
01849 
01850 bool
01851 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01852                                   MachineFunction &MF, bool isVarArg,
01853                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01854                         LLVMContext &Context) const {
01855   SmallVector<CCValAssign, 16> RVLocs;
01856   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
01857   return CCInfo.CheckReturn(Outs, RetCC_X86);
01858 }
01859 
01860 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
01861   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
01862   return ScratchRegs;
01863 }
01864 
01865 SDValue
01866 X86TargetLowering::LowerReturn(SDValue Chain,
01867                                CallingConv::ID CallConv, bool isVarArg,
01868                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01869                                const SmallVectorImpl<SDValue> &OutVals,
01870                                SDLoc dl, SelectionDAG &DAG) const {
01871   MachineFunction &MF = DAG.getMachineFunction();
01872   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01873 
01874   SmallVector<CCValAssign, 16> RVLocs;
01875   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
01876   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01877 
01878   SDValue Flag;
01879   SmallVector<SDValue, 6> RetOps;
01880   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
01881   // Operand #1 = Bytes To Pop
01882   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
01883                    MVT::i16));
01884 
01885   // Copy the result values into the output registers.
01886   for (unsigned i = 0; i != RVLocs.size(); ++i) {
01887     CCValAssign &VA = RVLocs[i];
01888     assert(VA.isRegLoc() && "Can only return in registers!");
01889     SDValue ValToCopy = OutVals[i];
01890     EVT ValVT = ValToCopy.getValueType();
01891 
01892     // Promote values to the appropriate types.
01893     if (VA.getLocInfo() == CCValAssign::SExt)
01894       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
01895     else if (VA.getLocInfo() == CCValAssign::ZExt)
01896       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
01897     else if (VA.getLocInfo() == CCValAssign::AExt)
01898       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
01899     else if (VA.getLocInfo() == CCValAssign::BCvt)
01900       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
01901 
01902     assert(VA.getLocInfo() != CCValAssign::FPExt &&
01903            "Unexpected FP-extend for return value.");
01904 
01905     // If this is x86-64, and we disabled SSE, we can't return FP values,
01906     // or SSE or MMX vectors.
01907     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
01908          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
01909           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
01910       report_fatal_error("SSE register return with SSE disabled");
01911     }
01912     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
01913     // llvm-gcc has never done it right and no one has noticed, so this
01914     // should be OK for now.
01915     if (ValVT == MVT::f64 &&
01916         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
01917       report_fatal_error("SSE2 register return with SSE2 disabled");
01918 
01919     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
01920     // the RET instruction and handled by the FP Stackifier.
01921     if (VA.getLocReg() == X86::FP0 ||
01922         VA.getLocReg() == X86::FP1) {
01923       // If this is a copy from an xmm register to ST(0), use an FPExtend to
01924       // change the value to the FP stack register class.
01925       if (isScalarFPTypeInSSEReg(VA.getValVT()))
01926         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
01927       RetOps.push_back(ValToCopy);
01928       // Don't emit a copytoreg.
01929       continue;
01930     }
01931 
01932     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
01933     // which is returned in RAX / RDX.
01934     if (Subtarget->is64Bit()) {
01935       if (ValVT == MVT::x86mmx) {
01936         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
01937           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
01938           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
01939                                   ValToCopy);
01940           // If we don't have SSE2 available, convert to v4f32 so the generated
01941           // register is legal.
01942           if (!Subtarget->hasSSE2())
01943             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
01944         }
01945       }
01946     }
01947 
01948     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
01949     Flag = Chain.getValue(1);
01950     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
01951   }
01952 
01953   // The x86-64 ABIs require that for returning structs by value we copy
01954   // the sret argument into %rax/%eax (depending on ABI) for the return.
01955   // Win32 requires us to put the sret argument to %eax as well.
01956   // We saved the argument into a virtual register in the entry block,
01957   // so now we copy the value out and into %rax/%eax.
01958   //
01959   // Checking Function.hasStructRetAttr() here is insufficient because the IR
01960   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
01961   // false, then an sret argument may be implicitly inserted in the SelDAG. In
01962   // either case FuncInfo->setSRetReturnReg() will have been called.
01963   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
01964     assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
01965            "No need for an sret register");
01966     SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
01967 
01968     unsigned RetValReg
01969         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
01970           X86::RAX : X86::EAX;
01971     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
01972     Flag = Chain.getValue(1);
01973 
01974     // RAX/EAX now acts like a return value.
01975     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
01976   }
01977 
01978   RetOps[0] = Chain;  // Update chain.
01979 
01980   // Add the flag if we have it.
01981   if (Flag.getNode())
01982     RetOps.push_back(Flag);
01983 
01984   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
01985 }
01986 
01987 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
01988   if (N->getNumValues() != 1)
01989     return false;
01990   if (!N->hasNUsesOfValue(1, 0))
01991     return false;
01992 
01993   SDValue TCChain = Chain;
01994   SDNode *Copy = *N->use_begin();
01995   if (Copy->getOpcode() == ISD::CopyToReg) {
01996     // If the copy has a glue operand, we conservatively assume it isn't safe to
01997     // perform a tail call.
01998     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
01999       return false;
02000     TCChain = Copy->getOperand(0);
02001   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
02002     return false;
02003 
02004   bool HasRet = false;
02005   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
02006        UI != UE; ++UI) {
02007     if (UI->getOpcode() != X86ISD::RET_FLAG)
02008       return false;
02009     // If we are returning more than one value, we can definitely
02010     // not make a tail call; see PR19530.
02011     if (UI->getNumOperands() > 4)
02012       return false;
02013     if (UI->getNumOperands() == 4 &&
02014         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
02015       return false;
02016     HasRet = true;
02017   }
02018 
02019   if (!HasRet)
02020     return false;
02021 
02022   Chain = TCChain;
02023   return true;
02024 }
02025 
02026 EVT
02027 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
02028                                             ISD::NodeType ExtendKind) const {
02029   MVT ReturnMVT;
02030   // TODO: Is this also valid on 32-bit?
02031   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
02032     ReturnMVT = MVT::i8;
02033   else
02034     ReturnMVT = MVT::i32;
02035 
02036   EVT MinVT = getRegisterType(Context, ReturnMVT);
02037   return VT.bitsLT(MinVT) ? MinVT : VT;
02038 }
02039 
02040 /// Lower the result values of a call into the
02041 /// appropriate copies out of appropriate physical registers.
02042 ///
02043 SDValue
02044 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
02045                                    CallingConv::ID CallConv, bool isVarArg,
02046                                    const SmallVectorImpl<ISD::InputArg> &Ins,
02047                                    SDLoc dl, SelectionDAG &DAG,
02048                                    SmallVectorImpl<SDValue> &InVals) const {
02049 
02050   // Assign locations to each value returned by this call.
02051   SmallVector<CCValAssign, 16> RVLocs;
02052   bool Is64Bit = Subtarget->is64Bit();
02053   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
02054                  *DAG.getContext());
02055   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02056 
02057   // Copy all of the result registers out of their specified physreg.
02058   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02059     CCValAssign &VA = RVLocs[i];
02060     EVT CopyVT = VA.getValVT();
02061 
02062     // If this is x86-64, and we disabled SSE, we can't return FP values
02063     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02064         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02065       report_fatal_error("SSE register return with SSE disabled");
02066     }
02067 
02068     // If we prefer to use the value in xmm registers, copy it out as f80 and
02069     // use a truncate to move it from fp stack reg to xmm reg.
02070     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
02071         isScalarFPTypeInSSEReg(VA.getValVT()))
02072       CopyVT = MVT::f80;
02073 
02074     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02075                                CopyVT, InFlag).getValue(1);
02076     SDValue Val = Chain.getValue(0);
02077 
02078     if (CopyVT != VA.getValVT())
02079       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02080                         // This truncation won't change the value.
02081                         DAG.getIntPtrConstant(1));
02082 
02083     InFlag = Chain.getValue(2);
02084     InVals.push_back(Val);
02085   }
02086 
02087   return Chain;
02088 }
02089 
02090 //===----------------------------------------------------------------------===//
02091 //                C & StdCall & Fast Calling Convention implementation
02092 //===----------------------------------------------------------------------===//
02093 //  The StdCall calling convention is standard for many Windows API
02094 //  routines. It differs from the C calling convention only slightly: the
02095 //  callee cleans up the stack rather than the caller, and symbols are also
02096 //  decorated in some fancy way :) It doesn't support any vector arguments.
02097 //  For info on fast calling convention see Fast Calling Convention (tail call)
02098 //  implementation LowerX86_32FastCCCallTo.
02099 
02100 /// CallIsStructReturn - Determines whether a call uses struct return
02101 /// semantics.
02102 enum StructReturnType {
02103   NotStructReturn,
02104   RegStructReturn,
02105   StackStructReturn
02106 };
02107 static StructReturnType
02108 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02109   if (Outs.empty())
02110     return NotStructReturn;
02111 
02112   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02113   if (!Flags.isSRet())
02114     return NotStructReturn;
02115   if (Flags.isInReg())
02116     return RegStructReturn;
02117   return StackStructReturn;
02118 }
02119 
02120 /// Determines whether a function uses struct return semantics.
02121 static StructReturnType
02122 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02123   if (Ins.empty())
02124     return NotStructReturn;
02125 
02126   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02127   if (!Flags.isSRet())
02128     return NotStructReturn;
02129   if (Flags.isInReg())
02130     return RegStructReturn;
02131   return StackStructReturn;
02132 }
02133 
02134 /// Make a copy of an aggregate at the address specified by "Src" to the address
02135 /// "Dst", with size and alignment information taken from the corresponding
02136 /// parameter attribute. The copy will be passed as a byval function parameter.
02137 static SDValue
02138 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02139                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02140                           SDLoc dl) {
02141   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02142 
02143   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02144                        /*isVolatile*/false, /*AlwaysInline=*/true,
02145                        MachinePointerInfo(), MachinePointerInfo());
02146 }
02147 
02148 /// Return true if the calling convention is one that
02149 /// supports tail call optimization.
02150 static bool IsTailCallConvention(CallingConv::ID CC) {
02151   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02152           CC == CallingConv::HiPE);
02153 }
02154 
02155 /// \brief Return true if the calling convention is a C calling convention.
02156 static bool IsCCallConvention(CallingConv::ID CC) {
02157   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02158           CC == CallingConv::X86_64_SysV);
02159 }
02160 
02161 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02162   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02163     return false;
02164 
02165   CallSite CS(CI);
02166   CallingConv::ID CalleeCC = CS.getCallingConv();
02167   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02168     return false;
02169 
02170   return true;
02171 }
02172 
02173 /// Return true if the function is being made into
02174 /// a tailcall target by changing its ABI.
02175 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02176                                    bool GuaranteedTailCallOpt) {
02177   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02178 }
02179 
02180 SDValue
02181 X86TargetLowering::LowerMemArgument(SDValue Chain,
02182                                     CallingConv::ID CallConv,
02183                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02184                                     SDLoc dl, SelectionDAG &DAG,
02185                                     const CCValAssign &VA,
02186                                     MachineFrameInfo *MFI,
02187                                     unsigned i) const {
02188   // Create the nodes corresponding to a load from this parameter slot.
02189   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02190   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
02191       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
02192   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02193   EVT ValVT;
02194 
02195   // If the value is passed by pointer, we have the address passed instead of
02196   // the value itself.
02197   if (VA.getLocInfo() == CCValAssign::Indirect)
02198     ValVT = VA.getLocVT();
02199   else
02200     ValVT = VA.getValVT();
02201 
02202   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02203   // changed with more analysis.
02204   // In case of tail call optimization, mark all arguments mutable, since they
02205   // could be overwritten by the lowering of arguments in case of a tail call.
02206   if (Flags.isByVal()) {
02207     unsigned Bytes = Flags.getByValSize();
02208     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02209     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02210     return DAG.getFrameIndex(FI, getPointerTy());
02211   } else {
02212     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02213                                     VA.getLocMemOffset(), isImmutable);
02214     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02215     return DAG.getLoad(ValVT, dl, Chain, FIN,
02216                        MachinePointerInfo::getFixedStack(FI),
02217                        false, false, false, 0);
02218   }
02219 }
02220 
02221 // FIXME: Get this from tablegen.
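// Returns the integer registers used for 64-bit argument passing: RCX, RDX,
// R8 and R9 under the Win64 convention, or the six SysV GPRs otherwise.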
02222 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
02223                                                 const X86Subtarget *Subtarget) {
02224   assert(Subtarget->is64Bit());
02225 
02226   if (Subtarget->isCallingConvWin64(CallConv)) {
02227     static const MCPhysReg GPR64ArgRegsWin64[] = {
02228       X86::RCX, X86::RDX, X86::R8,  X86::R9
02229     };
02230     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
02231   }
02232 
02233   static const MCPhysReg GPR64ArgRegs64Bit[] = {
02234     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02235   };
02236   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
02237 }
02238 
02239 // FIXME: Get this from tablegen.
02240 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
02241                                                 CallingConv::ID CallConv,
02242                                                 const X86Subtarget *Subtarget) {
02243   assert(Subtarget->is64Bit());
02244   if (Subtarget->isCallingConvWin64(CallConv)) {
02245     // The XMM registers which might contain var arg parameters are shadowed
02246     // in their paired GPR, so we only need to save the GPRs to their home
02247     // slots.
02248     // TODO: __vectorcall will change this.
02249     return None;
02250   }
02251 
02252   const Function *Fn = MF.getFunction();
02253   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
02254   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
02255          "SSE register cannot be used when SSE is disabled!");
02256   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02257       !Subtarget->hasSSE1())
02258     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
02259     // registers.
02260     return None;
02261 
02262   static const MCPhysReg XMMArgRegs64Bit[] = {
02263     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02264     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02265   };
02266   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
02267 }
02268 
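      /// Lower the incoming (formal) arguments of a function: copy register
      /// arguments out of their physical registers, load stack arguments from the
      /// caller's frame, and set up the vararg register save area when needed.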
02269 SDValue
02270 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02271                                         CallingConv::ID CallConv,
02272                                         bool isVarArg,
02273                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02274                                         SDLoc dl,
02275                                         SelectionDAG &DAG,
02276                                         SmallVectorImpl<SDValue> &InVals)
02277                                           const {
02278   MachineFunction &MF = DAG.getMachineFunction();
02279   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02280 
02281   const Function* Fn = MF.getFunction();
02282   if (Fn->hasExternalLinkage() &&
02283       Subtarget->isTargetCygMing() &&
02284       Fn->getName() == "main")
02285     FuncInfo->setForceFramePointer(true);
02286 
02287   MachineFrameInfo *MFI = MF.getFrameInfo();
02288   bool Is64Bit = Subtarget->is64Bit();
02289   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02290 
02291   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02292          "Var args not supported with calling convention fastcc, ghc or hipe");
02293 
02294   // Assign locations to all of the incoming arguments.
02295   SmallVector<CCValAssign, 16> ArgLocs;
02296   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02297 
02298   // Allocate shadow area for Win64
02299   if (IsWin64)
02300     CCInfo.AllocateStack(32, 8);
02301 
02302   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02303 
02304   unsigned LastVal = ~0U;
02305   SDValue ArgValue;
02306   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02307     CCValAssign &VA = ArgLocs[i];
02308     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02309     // places.
02310     assert(VA.getValNo() != LastVal &&
02311            "Don't support value assigned to multiple locs yet");
02312     (void)LastVal;
02313     LastVal = VA.getValNo();
02314 
02315     if (VA.isRegLoc()) {
02316       EVT RegVT = VA.getLocVT();
02317       const TargetRegisterClass *RC;
02318       if (RegVT == MVT::i32)
02319         RC = &X86::GR32RegClass;
02320       else if (Is64Bit && RegVT == MVT::i64)
02321         RC = &X86::GR64RegClass;
02322       else if (RegVT == MVT::f32)
02323         RC = &X86::FR32RegClass;
02324       else if (RegVT == MVT::f64)
02325         RC = &X86::FR64RegClass;
02326       else if (RegVT.is512BitVector())
02327         RC = &X86::VR512RegClass;
02328       else if (RegVT.is256BitVector())
02329         RC = &X86::VR256RegClass;
02330       else if (RegVT.is128BitVector())
02331         RC = &X86::VR128RegClass;
02332       else if (RegVT == MVT::x86mmx)
02333         RC = &X86::VR64RegClass;
02334       else if (RegVT == MVT::i1)
02335         RC = &X86::VK1RegClass;
02336       else if (RegVT == MVT::v8i1)
02337         RC = &X86::VK8RegClass;
02338       else if (RegVT == MVT::v16i1)
02339         RC = &X86::VK16RegClass;
02340       else if (RegVT == MVT::v32i1)
02341         RC = &X86::VK32RegClass;
02342       else if (RegVT == MVT::v64i1)
02343         RC = &X86::VK64RegClass;
02344       else
02345         llvm_unreachable("Unknown argument type!");
02346 
02347       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02348       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02349 
02350       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02351       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02352       // right size.
02353       if (VA.getLocInfo() == CCValAssign::SExt)
02354         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02355                                DAG.getValueType(VA.getValVT()));
02356       else if (VA.getLocInfo() == CCValAssign::ZExt)
02357         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02358                                DAG.getValueType(VA.getValVT()));
02359       else if (VA.getLocInfo() == CCValAssign::BCvt)
02360         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02361 
02362       if (VA.isExtInLoc()) {
02363         // Handle MMX values passed in XMM regs.
02364         if (RegVT.isVector())
02365           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02366         else
02367           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02368       }
02369     } else {
02370       assert(VA.isMemLoc());
02371       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02372     }
02373 
02374     // If the value is passed via a pointer, do a load.
02375     if (VA.getLocInfo() == CCValAssign::Indirect)
02376       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02377                              MachinePointerInfo(), false, false, false, 0);
02378 
02379     InVals.push_back(ArgValue);
02380   }
02381 
02382   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
02383     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02384       // The x86-64 ABIs require that for returning structs by value we copy
02385       // the sret argument into %rax/%eax (depending on ABI) for the return.
02386       // Win32 requires us to put the sret argument in %eax as well.
02387       // Save the argument into a virtual register so that we can access it
02388       // from the return points.
02389       if (Ins[i].Flags.isSRet()) {
02390         unsigned Reg = FuncInfo->getSRetReturnReg();
02391         if (!Reg) {
02392           MVT PtrTy = getPointerTy();
02393           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02394           FuncInfo->setSRetReturnReg(Reg);
02395         }
02396         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
02397         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02398         break;
02399       }
02400     }
02401   }
02402 
02403   unsigned StackSize = CCInfo.getNextStackOffset();
02404   // Align stack specially for tail calls.
02405   if (FuncIsMadeTailCallSafe(CallConv,
02406                              MF.getTarget().Options.GuaranteedTailCallOpt))
02407     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02408 
02409   // If the function takes a variable number of arguments, make a frame index for
02410   // the start of the first vararg value... for expansion of llvm.va_start. We
02411   // can skip this if there are no va_start calls.
02412   if (MFI->hasVAStart() &&
02413       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02414                    CallConv != CallingConv::X86_ThisCall))) {
02415     FuncInfo->setVarArgsFrameIndex(
02416         MFI->CreateFixedObject(1, StackSize, true));
02417   }
02418 
02419   // Figure out if XMM registers are in use.
02420   assert(!(MF.getTarget().Options.UseSoftFloat &&
02421            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
02422          "SSE register cannot be used when SSE is disabled!");
02423 
02424   // 64-bit calling conventions support varargs and register parameters, so we
02425   // have to do extra work to spill them in the prologue.
02426   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
02427     // Find the first unallocated argument register in each register class.
02428     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
02429     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
02430     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
02431     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
02432     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02433            "SSE register cannot be used when SSE is disabled!");
02434 
02435     // Gather all the live in physical registers.
02436     SmallVector<SDValue, 6> LiveGPRs;
02437     SmallVector<SDValue, 8> LiveXMMRegs;
02438     SDValue ALVal;
02439     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
02440       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
02441       LiveGPRs.push_back(
02442           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
02443     }
02444     if (!ArgXMMs.empty()) {
02445       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02446       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
02447       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
02448         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
02449         LiveXMMRegs.push_back(
02450             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
02451       }
02452     }
02453 
02454     if (IsWin64) {
02455       const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
02456       // Get to the caller-allocated home save location.  Add 8 to account
02457       // for the return address.
02458       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02459       FuncInfo->setRegSaveFrameIndex(
02460           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02461       // Fixup to set vararg frame on shadow area (4 x i64).
02462       if (NumIntRegs < 4)
02463         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02464     } else {
02465       // For X86-64, if there are vararg parameters that are passed via
02466       // registers, then we must store them to their spots on the stack so
02467       // they may be loaded by dereferencing the result of va_next.
02468       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02469       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
02470       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
02471           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
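            // With SSE enabled this is the full SysV AMD64 register save area:
            // 6 GPRs * 8 bytes plus 8 XMM registers * 16 bytes = 176 bytes; the
            // gp_offset and fp_offset fields of a va_list index into this block.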
02472     }
02473 
02474     // Store the integer parameter registers.
02475     SmallVector<SDValue, 8> MemOps;
02476     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02477                                       getPointerTy());
02478     unsigned Offset = FuncInfo->getVarArgsGPOffset();
02479     for (SDValue Val : LiveGPRs) {
02480       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02481                                 DAG.getIntPtrConstant(Offset));
02482       SDValue Store =
02483         DAG.getStore(Val.getValue(1), dl, Val, FIN,
02484                      MachinePointerInfo::getFixedStack(
02485                        FuncInfo->getRegSaveFrameIndex(), Offset),
02486                      false, false, 0);
02487       MemOps.push_back(Store);
02488       Offset += 8;
02489     }
02490 
02491     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
02492       // Now store the XMM (fp + vector) parameter registers.
02493       SmallVector<SDValue, 12> SaveXMMOps;
02494       SaveXMMOps.push_back(Chain);
02495       SaveXMMOps.push_back(ALVal);
02496       SaveXMMOps.push_back(DAG.getIntPtrConstant(
02497                              FuncInfo->getRegSaveFrameIndex()));
02498       SaveXMMOps.push_back(DAG.getIntPtrConstant(
02499                              FuncInfo->getVarArgsFPOffset()));
02500       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
02501                         LiveXMMRegs.end());
02502       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02503                                    MVT::Other, SaveXMMOps));
02504     }
02505 
02506     if (!MemOps.empty())
02507       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
02508   }
02509 
02510   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
02511     // Find the largest legal vector type.
02512     MVT VecVT = MVT::Other;
02513     // FIXME: Only some x86_32 calling conventions support AVX512.
02514     if (Subtarget->hasAVX512() &&
02515         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
02516                      CallConv == CallingConv::Intel_OCL_BI)))
02517       VecVT = MVT::v16f32;
02518     else if (Subtarget->hasAVX())
02519       VecVT = MVT::v8f32;
02520     else if (Subtarget->hasSSE2())
02521       VecVT = MVT::v4f32;
02522 
02523     // We forward some GPRs and some vector types.
02524     SmallVector<MVT, 2> RegParmTypes;
02525     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
02526     RegParmTypes.push_back(IntVT);
02527     if (VecVT != MVT::Other)
02528       RegParmTypes.push_back(VecVT);
02529 
02530     // Compute the set of forwarded registers. The rest are scratch.
02531     SmallVectorImpl<ForwardedRegister> &Forwards =
02532         FuncInfo->getForwardedMustTailRegParms();
02533     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
02534 
02535     // Conservatively forward AL on x86_64, since it might be used for varargs.
02536     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
02537       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02538       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
02539     }
02540 
02541     // Copy all forwards from physical to virtual registers.
02542     for (ForwardedRegister &F : Forwards) {
02543       // FIXME: Can we use a less constrained schedule?
02544       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02545       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
02546       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
02547     }
02548   }
02549 
02550   // Some CCs need callee pop.
02551   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02552                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02553     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02554   } else {
02555     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02556     // If this is an sret function, the return should pop the hidden pointer.
02557     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02558         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02559         argsAreStructReturn(Ins) == StackStructReturn)
02560       FuncInfo->setBytesToPopOnReturn(4);
02561   }
02562 
02563   if (!Is64Bit) {
02564     // RegSaveFrameIndex is X86-64 only.
02565     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02566     if (CallConv == CallingConv::X86_FastCall ||
02567         CallConv == CallingConv::X86_ThisCall)
02568       // fastcc functions can't have varargs.
02569       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02570   }
02571 
02572   FuncInfo->setArgumentStackSize(StackSize);
02573 
02574   return Chain;
02575 }
02576 
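      /// Emit the store (or, for byval arguments, the copy) of a single outgoing
      /// call argument into its stack slot at StackPtr + the location's offset.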
02577 SDValue
02578 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02579                                     SDValue StackPtr, SDValue Arg,
02580                                     SDLoc dl, SelectionDAG &DAG,
02581                                     const CCValAssign &VA,
02582                                     ISD::ArgFlagsTy Flags) const {
02583   unsigned LocMemOffset = VA.getLocMemOffset();
02584   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02585   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02586   if (Flags.isByVal())
02587     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02588 
02589   return DAG.getStore(Chain, dl, Arg, PtrOff,
02590                       MachinePointerInfo::getStack(LocMemOffset),
02591                       false, false, 0);
02592 }
02593 
02594 /// Emit a load of the return address if tail call
02595 /// optimization is performed and it is required.
02596 SDValue
02597 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02598                                            SDValue &OutRetAddr, SDValue Chain,
02599                                            bool IsTailCall, bool Is64Bit,
02600                                            int FPDiff, SDLoc dl) const {
02601   // Adjust the Return address stack slot.
02602   EVT VT = getPointerTy();
02603   OutRetAddr = getReturnAddressFrameIndex(DAG);
02604 
02605   // Load the "old" Return address.
02606   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02607                            false, false, false, 0);
02608   return SDValue(OutRetAddr.getNode(), 1);
02609 }
02610 
02611 /// Emit a store of the return address if tail call
02612 /// optimization is performed and it is required (FPDiff!=0).
02613 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
02614                                         SDValue Chain, SDValue RetAddrFrIdx,
02615                                         EVT PtrVT, unsigned SlotSize,
02616                                         int FPDiff, SDLoc dl) {
02617   // Store the return address to the appropriate stack slot.
02618   if (!FPDiff) return Chain;
02619   // Calculate the new stack slot for the return address.
02620   int NewReturnAddrFI =
02621     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02622                                          false);
02623   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02624   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02625                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02626                        false, false, 0);
02627   return Chain;
02628 }
02629 
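      /// Lower an outgoing call: classify the operands, copy register arguments
      /// into place, store stack arguments, and emit the X86ISD::CALL node (or
      /// X86ISD::TC_RETURN for tail calls).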
02630 SDValue
02631 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02632                              SmallVectorImpl<SDValue> &InVals) const {
02633   SelectionDAG &DAG                     = CLI.DAG;
02634   SDLoc &dl                             = CLI.DL;
02635   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02636   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02637   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02638   SDValue Chain                         = CLI.Chain;
02639   SDValue Callee                        = CLI.Callee;
02640   CallingConv::ID CallConv              = CLI.CallConv;
02641   bool &isTailCall                      = CLI.IsTailCall;
02642   bool isVarArg                         = CLI.IsVarArg;
02643 
02644   MachineFunction &MF = DAG.getMachineFunction();
02645   bool Is64Bit        = Subtarget->is64Bit();
02646   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02647   StructReturnType SR = callIsStructReturn(Outs);
02648   bool IsSibcall      = false;
02649   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02650 
02651   if (MF.getTarget().Options.DisableTailCalls)
02652     isTailCall = false;
02653 
02654   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
02655   if (IsMustTail) {
02656     // Force this to be a tail call.  The verifier rules are enough to ensure
02657     // that we can lower this successfully without moving the return address
02658     // around.
02659     isTailCall = true;
02660   } else if (isTailCall) {
02661     // Check if it's really possible to do a tail call.
02662     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02663                     isVarArg, SR != NotStructReturn,
02664                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02665                     Outs, OutVals, Ins, DAG);
02666 
02667     // Sibcalls are automatically detected tailcalls which do not require
02668     // ABI changes.
02669     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02670       IsSibcall = true;
02671 
02672     if (isTailCall)
02673       ++NumTailCalls;
02674   }
02675 
02676   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02677          "Var args not supported with calling convention fastcc, ghc or hipe");
02678 
02679   // Analyze operands of the call, assigning locations to each operand.
02680   SmallVector<CCValAssign, 16> ArgLocs;
02681   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02682 
02683   // Allocate shadow area for Win64
02684   if (IsWin64)
02685     CCInfo.AllocateStack(32, 8);
02686 
02687   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02688 
02689   // Get a count of how many bytes are to be pushed on the stack.
02690   unsigned NumBytes = CCInfo.getNextStackOffset();
02691   if (IsSibcall)
02692     // This is a sibcall. The memory operands are already available in the
02693     // caller's own incoming argument area (the caller's caller's stack frame).
02694     NumBytes = 0;
02695   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
02696            IsTailCallConvention(CallConv))
02697     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02698 
02699   int FPDiff = 0;
02700   if (isTailCall && !IsSibcall && !IsMustTail) {
02701     // Lower arguments at fp - stackoffset + fpdiff.
02702     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02703 
02704     FPDiff = NumBytesCallerPushed - NumBytes;
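          // A negative FPDiff means the callee needs more argument space than the
          // caller's incoming argument area provides, so the return address has to
          // be moved to make room (see EmitTailCallStoreRetAddr).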
02705 
02706     // Record the delta by which the return address stack slot moves, but only
02707     // if this call requires a larger adjustment than any recorded so far.
02708     if (FPDiff < X86Info->getTCReturnAddrDelta())
02709       X86Info->setTCReturnAddrDelta(FPDiff);
02710   }
02711 
02712   unsigned NumBytesToPush = NumBytes;
02713   unsigned NumBytesToPop = NumBytes;
02714 
02715   // If we have an inalloca argument, all stack space has already been allocated
02716   // for us and is right at the top of the stack.  We don't support multiple
02717   // arguments passed in memory when using inalloca.
02718   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02719     NumBytesToPush = 0;
02720     if (!ArgLocs.back().isMemLoc())
02721       report_fatal_error("cannot use inalloca attribute on a register "
02722                          "parameter");
02723     if (ArgLocs.back().getLocMemOffset() != 0)
02724       report_fatal_error("any parameter with the inalloca attribute must be "
02725                          "the only memory argument");
02726   }
02727 
02728   if (!IsSibcall)
02729     Chain = DAG.getCALLSEQ_START(
02730         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02731 
02732   SDValue RetAddrFrIdx;
02733   // Load return address for tail calls.
02734   if (isTailCall && FPDiff)
02735     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02736                                     Is64Bit, FPDiff, dl);
02737 
02738   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02739   SmallVector<SDValue, 8> MemOpChains;
02740   SDValue StackPtr;
02741 
02742   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02743   // of tail call optimization, arguments are handled later.
02744   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
02745   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02746     // Skip inalloca arguments; they have already been written.
02747     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02748     if (Flags.isInAlloca())
02749       continue;
02750 
02751     CCValAssign &VA = ArgLocs[i];
02752     EVT RegVT = VA.getLocVT();
02753     SDValue Arg = OutVals[i];
02754     bool isByVal = Flags.isByVal();
02755 
02756     // Promote the value if needed.
02757     switch (VA.getLocInfo()) {
02758     default: llvm_unreachable("Unknown loc info!");
02759     case CCValAssign::Full: break;
02760     case CCValAssign::SExt:
02761       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02762       break;
02763     case CCValAssign::ZExt:
02764       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02765       break;
02766     case CCValAssign::AExt:
02767       if (RegVT.is128BitVector()) {
02768         // Special case: passing MMX values in XMM registers.
02769         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02770         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02771         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02772       } else
02773         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02774       break;
02775     case CCValAssign::BCvt:
02776       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02777       break;
02778     case CCValAssign::Indirect: {
02779       // Store the argument.
02780       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02781       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02782       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02783                            MachinePointerInfo::getFixedStack(FI),
02784                            false, false, 0);
02785       Arg = SpillSlot;
02786       break;
02787     }
02788     }
02789 
02790     if (VA.isRegLoc()) {
02791       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02792       if (isVarArg && IsWin64) {
02793         // The Win64 ABI requires an argument XMM register to be copied to the
02794         // corresponding shadow GPR if the callee is a varargs function.
02795         unsigned ShadowReg = 0;
02796         switch (VA.getLocReg()) {
02797         case X86::XMM0: ShadowReg = X86::RCX; break;
02798         case X86::XMM1: ShadowReg = X86::RDX; break;
02799         case X86::XMM2: ShadowReg = X86::R8; break;
02800         case X86::XMM3: ShadowReg = X86::R9; break;
02801         }
02802         if (ShadowReg)
02803           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02804       }
02805     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02806       assert(VA.isMemLoc());
02807       if (!StackPtr.getNode())
02808         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02809                                       getPointerTy());
02810       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02811                                              dl, DAG, VA, Flags));
02812     }
02813   }
02814 
02815   if (!MemOpChains.empty())
02816     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
02817 
02818   if (Subtarget->isPICStyleGOT()) {
02819     // ELF / PIC requires the GOT pointer to be in the EBX register before
02820     // function calls made via the PLT.
02821     if (!isTailCall) {
02822       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02823                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02824     } else {
02825       // If we are tail calling and generating PIC/GOT style code load the
02826       // address of the callee into ECX. The value in ECX is used as the target of
02827       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02828       // for tail calls on PIC/GOT architectures. Normally we would just put the
02829       // address of GOT into ebx and then call target@PLT. But for tail calls
02830       // ebx would be restored (since ebx is callee saved) before jumping to the
02831       // target@PLT.
02832 
02833       // Note: The actual moving to ECX is done further down.
02834       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02835       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02836           !G->getGlobal()->hasProtectedVisibility())
02837         Callee = LowerGlobalAddress(Callee, DAG);
02838       else if (isa<ExternalSymbolSDNode>(Callee))
02839         Callee = LowerExternalSymbol(Callee, DAG);
02840     }
02841   }
02842 
02843   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
02844     // From AMD64 ABI document:
02845     // For calls that may call functions that use varargs or stdargs
02846     // (prototype-less calls or calls to functions containing ellipsis (...) in
02847     // the declaration) %al is used as a hidden argument to specify the number
02848     // of SSE registers used. The contents of %al do not need to match exactly
02849     // the number of registers, but must be an upper bound on the number of SSE
02850     // registers used and is in the range 0 - 8 inclusive.
02851 
02852     // Count the number of XMM registers allocated.
02853     static const MCPhysReg XMMArgRegs[] = {
02854       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02855       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02856     };
02857     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
02858     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02859            && "SSE registers cannot be used when SSE is disabled");
02860 
02861     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02862                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
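          // For example, a varargs call that passes one double in XMM0 ends up
          // with AL = 1, while a call that uses no vector registers passes AL = 0.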
02863   }
02864 
02865   if (isVarArg && IsMustTail) {
02866     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
02867     for (const auto &F : Forwards) {
02868       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02869       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
02870     }
02871   }
02872 
02873   // For tail calls, lower the arguments to the 'real' stack slots.  Sibcalls
02874   // don't need this because the eligibility check rejects calls that require
02875   // shuffling arguments passed in memory.
02876   if (!IsSibcall && isTailCall) {
02877     // Force all the incoming stack arguments to be loaded from the stack
02878     // before any new outgoing arguments are stored to the stack, because the
02879     // outgoing stack slots may alias the incoming argument stack slots, and
02880     // the alias isn't otherwise explicit. This is slightly more conservative
02881     // than necessary, because it means that each store effectively depends
02882     // on every argument instead of just those arguments it would clobber.
02883     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02884 
02885     SmallVector<SDValue, 8> MemOpChains2;
02886     SDValue FIN;
02887     int FI = 0;
02888     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02889       CCValAssign &VA = ArgLocs[i];
02890       if (VA.isRegLoc())
02891         continue;
02892       assert(VA.isMemLoc());
02893       SDValue Arg = OutVals[i];
02894       ISD::ArgFlagsTy Flags = Outs[i].Flags;
02895       // Skip inalloca arguments.  They don't require any work.
02896       if (Flags.isInAlloca())
02897         continue;
02898       // Create frame index.
02899       int32_t Offset = VA.getLocMemOffset()+FPDiff;
02900       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
02901       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
02902       FIN = DAG.getFrameIndex(FI, getPointerTy());
02903 
02904       if (Flags.isByVal()) {
02905         // Copy relative to framepointer.
02906         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
02907         if (!StackPtr.getNode())
02908           StackPtr = DAG.getCopyFromReg(Chain, dl,
02909                                         RegInfo->getStackRegister(),
02910                                         getPointerTy());
02911         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
02912 
02913         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
02914                                                          ArgChain,
02915                                                          Flags, DAG, dl));
02916       } else {
02917         // Store relative to framepointer.
02918         MemOpChains2.push_back(
02919           DAG.getStore(ArgChain, dl, Arg, FIN,
02920                        MachinePointerInfo::getFixedStack(FI),
02921                        false, false, 0));
02922       }
02923     }
02924 
02925     if (!MemOpChains2.empty())
02926       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
02927 
02928     // Store the return address to the appropriate stack slot.
02929     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
02930                                      getPointerTy(), RegInfo->getSlotSize(),
02931                                      FPDiff, dl);
02932   }
02933 
02934   // Build a sequence of copy-to-reg nodes chained together with token chain
02935   // and flag operands which copy the outgoing args into registers.
02936   SDValue InFlag;
02937   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
02938     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
02939                              RegsToPass[i].second, InFlag);
02940     InFlag = Chain.getValue(1);
02941   }
02942 
02943   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
02944     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
02945     // In the 64-bit large code model, we have to make all calls
02946     // through a register, since the call instruction's 32-bit
02947     // pc-relative offset may not be large enough to hold the whole
02948     // address.
02949   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
02950     // If the callee is a GlobalAddress node (quite common, every direct call
02951     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
02952     // it.
02953     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
02954 
02955     // We should use an extra load for direct calls to dllimported functions in
02956     // non-JIT mode.
02957     const GlobalValue *GV = G->getGlobal();
02958     if (!GV->hasDLLImportStorageClass()) {
02959       unsigned char OpFlags = 0;
02960       bool ExtraLoad = false;
02961       unsigned WrapperKind = ISD::DELETED_NODE;
02962 
02963       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
02964       // external symbols must go through the PLT in PIC mode.  If the symbol
02965       // has hidden or protected visibility, or if it is static or local, then
02966       // we don't need to use the PLT - we can directly call it.
02967       if (Subtarget->isTargetELF() &&
02968           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
02969           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
02970         OpFlags = X86II::MO_PLT;
02971       } else if (Subtarget->isPICStyleStubAny() &&
02972                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
02973                  (!Subtarget->getTargetTriple().isMacOSX() ||
02974                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
02975         // PC-relative references to external symbols should go through $stub,
02976         // unless we're building with the leopard linker or later, which
02977         // automatically synthesizes these stubs.
02978         OpFlags = X86II::MO_DARWIN_STUB;
02979       } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
02980                  cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
02981         // If the function is marked as non-lazy, generate an indirect call
02982         // which loads from the GOT directly. This avoids runtime overhead
02983         // at the cost of eager binding (and one extra byte of encoding).
02984         OpFlags = X86II::MO_GOTPCREL;
02985         WrapperKind = X86ISD::WrapperRIP;
02986         ExtraLoad = true;
02987       }
02988 
02989       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
02990                                           G->getOffset(), OpFlags);
02991 
02992       // Add a wrapper if needed.
02993       if (WrapperKind != ISD::DELETED_NODE)
02994         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
02995       // Add extra indirection if needed.
02996       if (ExtraLoad)
02997         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
02998                              MachinePointerInfo::getGOT(),
02999                              false, false, false, 0);
03000     }
03001   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
03002     unsigned char OpFlags = 0;
03003 
03004     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
03005     // external symbols should go through the PLT.
03006     if (Subtarget->isTargetELF() &&
03007         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
03008       OpFlags = X86II::MO_PLT;
03009     } else if (Subtarget->isPICStyleStubAny() &&
03010                (!Subtarget->getTargetTriple().isMacOSX() ||
03011                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03012       // PC-relative references to external symbols should go through $stub,
03013       // unless we're building with the leopard linker or later, which
03014       // automatically synthesizes these stubs.
03015       OpFlags = X86II::MO_DARWIN_STUB;
03016     }
03017 
03018     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
03019                                          OpFlags);
03020   } else if (Subtarget->isTarget64BitILP32() &&
03021              Callee->getValueType(0) == MVT::i32) {
03022     // Zero-extend the 32-bit Callee address into a 64-bit one, per the x32 ABI.
03023     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
03024   }
03025 
03026   // Returns a chain & a flag for retval copy to use.
03027   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
03028   SmallVector<SDValue, 8> Ops;
03029 
03030   if (!IsSibcall && isTailCall) {
03031     Chain = DAG.getCALLSEQ_END(Chain,
03032                                DAG.getIntPtrConstant(NumBytesToPop, true),
03033                                DAG.getIntPtrConstant(0, true), InFlag, dl);
03034     InFlag = Chain.getValue(1);
03035   }
03036 
03037   Ops.push_back(Chain);
03038   Ops.push_back(Callee);
03039 
03040   if (isTailCall)
03041     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
03042 
03043   // Add argument registers to the end of the list so that they are known live
03044   // into the call.
03045   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
03046     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
03047                                   RegsToPass[i].second.getValueType()));
03048 
03049   // Add a register mask operand representing the call-preserved registers.
03050   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
03051   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
03052   assert(Mask && "Missing call preserved mask for calling convention");
03053   Ops.push_back(DAG.getRegisterMask(Mask));
03054 
03055   if (InFlag.getNode())
03056     Ops.push_back(InFlag);
03057 
03058   if (isTailCall) {
03059     // We used to do:
03060     //// If this is the first return lowered for this function, add the regs
03061     //// to the liveout set for the function.
03062     // This isn't right, although it's probably harmless on x86; liveouts
03063     // should be computed from returns not tail calls.  Consider a void
03064     // function making a tail call to a function returning int.
03065     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03066   }
03067 
03068   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03069   InFlag = Chain.getValue(1);
03070 
03071   // Create the CALLSEQ_END node.
03072   unsigned NumBytesForCalleeToPop;
03073   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03074                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03075     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03076   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03077            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03078            SR == StackStructReturn)
03079     // If this is a call to a struct-return function, the callee
03080     // pops the hidden struct pointer, so we have to push it back.
03081     // This is common for Darwin/X86, Linux & Mingw32 targets.
03082     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03083     NumBytesForCalleeToPop = 4;
03084   else
03085     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03086 
03087   // Returns a flag for retval copy to use.
03088   if (!IsSibcall) {
03089     Chain = DAG.getCALLSEQ_END(Chain,
03090                                DAG.getIntPtrConstant(NumBytesToPop, true),
03091                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03092                                                      true),
03093                                InFlag, dl);
03094     InFlag = Chain.getValue(1);
03095   }
03096 
03097   // Handle result values, copying them out of physregs into vregs that we
03098   // return.
03099   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03100                          Ins, dl, DAG, InVals);
03101 }
03102 
03103 //===----------------------------------------------------------------------===//
03104 //                Fast Calling Convention (tail call) implementation
03105 //===----------------------------------------------------------------------===//
03106 
03107 //  Like the stdcall convention, the callee cleans up the arguments, except
03108 //  that ECX is reserved for storing the address of the tail-called function.
03109 //  Only 2 registers are free for argument passing (inreg). Tail call
03110 //  optimization is performed provided:
03111 //                * tailcallopt is enabled
03112 //                * caller/callee are fastcc
03113 //  On the X86_64 architecture with GOT-style position-independent code, only
03114 //  local (within-module) calls are supported at the moment.
03115 //  To keep the stack aligned according to the platform ABI, the function
03116 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
03117 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's
03118 //  dyld, for example.) If a tail-called callee has more arguments than the
03119 //  caller, the caller must make sure that there is room to move the RETADDR
03120 //  to. This is achieved by reserving an area the size of the argument delta
03121 //  right after the original RETADDR, but before the saved frame pointer or the
03122 //  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
03123 //  stack layout:
03124 //    arg1
03125 //    arg2
03126 //    RETADDR
03127 //    [ new RETADDR
03128 //      move area ]
03129 //    (possible EBP)
03130 //    ESI
03131 //    EDI
03132 //    local1 ..
03133 
03134 /// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 bytes
03135 /// for a 16-byte alignment requirement with a 4-byte slot size.
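      /// For example, with StackAlignment = 16 and SlotSize = 4, a StackSize of 20
      /// is rounded up to 28 and one of 30 to 44, so that pushing the 4-byte return
      /// address leaves the stack 16-byte aligned.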
03136 unsigned
03137 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03138                                                SelectionDAG& DAG) const {
03139   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
03140   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
03141   unsigned StackAlignment = TFI.getStackAlignment();
03142   uint64_t AlignMask = StackAlignment - 1;
03143   int64_t Offset = StackSize;
03144   unsigned SlotSize = RegInfo->getSlotSize();
03145   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03146     // The remainder does not exceed StackAlignment - SlotSize; just add the difference.
03147     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03148   } else {
03149     // Mask out the lower bits and add StackAlignment plus (StackAlignment - SlotSize).
03150     Offset = ((~AlignMask) & Offset) + StackAlignment +
03151       (StackAlignment-SlotSize);
03152   }
03153   return Offset;
03154 }
03155 
03156 /// MatchingStackOffset - Return true if the given stack call argument is
03157 /// already available in the same position (relatively) of the caller's
03158 /// incoming argument stack.
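      /// This lets a sibcall reuse an incoming stack argument in place: for example,
      /// if the caller received an i32 at incoming offset 8 and passes the same value
      /// to the callee at outgoing offset 8, no copy is needed.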
03159 static
03160 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03161                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03162                          const X86InstrInfo *TII) {
03163   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03164   int FI = INT_MAX;
03165   if (Arg.getOpcode() == ISD::CopyFromReg) {
03166     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03167     if (!TargetRegisterInfo::isVirtualRegister(VR))
03168       return false;
03169     MachineInstr *Def = MRI->getVRegDef(VR);
03170     if (!Def)
03171       return false;
03172     if (!Flags.isByVal()) {
03173       if (!TII->isLoadFromStackSlot(Def, FI))
03174         return false;
03175     } else {
03176       unsigned Opcode = Def->getOpcode();
03177       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
03178            Opcode == X86::LEA64_32r) &&
03179           Def->getOperand(1).isFI()) {
03180         FI = Def->getOperand(1).getIndex();
03181         Bytes = Flags.getByValSize();
03182       } else
03183         return false;
03184     }
03185   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03186     if (Flags.isByVal())
03187       // ByVal argument is passed in as a pointer but it's now being
03188       // dereferenced. e.g.
03189       // define @foo(%struct.X* %A) {
03190       //   tail call @bar(%struct.X* byval %A)
03191       // }
03192       return false;
03193     SDValue Ptr = Ld->getBasePtr();
03194     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03195     if (!FINode)
03196       return false;
03197     FI = FINode->getIndex();
03198   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03199     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03200     FI = FINode->getIndex();
03201     Bytes = Flags.getByValSize();
03202   } else
03203     return false;
03204 
03205   assert(FI != INT_MAX);
03206   if (!MFI->isFixedObjectIndex(FI))
03207     return false;
03208   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03209 }
03210 
03211 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03212 /// for tail call optimization. Targets which want to do tail call
03213 /// optimization should implement this function.
03214 bool
03215 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03216                                                      CallingConv::ID CalleeCC,
03217                                                      bool isVarArg,
03218                                                      bool isCalleeStructRet,
03219                                                      bool isCallerStructRet,
03220                                                      Type *RetTy,
03221                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03222                                     const SmallVectorImpl<SDValue> &OutVals,
03223                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03224                                                      SelectionDAG &DAG) const {
03225   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03226     return false;
03227 
03228   // If -tailcallopt is specified, make fastcc functions tail-callable.
03229   const MachineFunction &MF = DAG.getMachineFunction();
03230   const Function *CallerF = MF.getFunction();
03231 
03232   // If the function return type is x86_fp80 and the callee return type is not,
03233   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03234   // perform a tailcall optimization here.
03235   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03236     return false;
03237 
03238   CallingConv::ID CallerCC = CallerF->getCallingConv();
03239   bool CCMatch = CallerCC == CalleeCC;
03240   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03241   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03242 
03243   // Win64 functions have extra shadow space for argument homing. Don't do the
03244   // sibcall if the caller and callee have mismatched expectations for this
03245   // space.
03246   if (IsCalleeWin64 != IsCallerWin64)
03247     return false;
03248 
03249   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
03250     if (IsTailCallConvention(CalleeCC) && CCMatch)
03251       return true;
03252     return false;
03253   }
03254 
03255   // Look for obvious safe cases to perform tail call optimization that do not
03256   // require ABI changes. This is what gcc calls sibcall.
03257 
03258   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03259   // emit a special epilogue.
03260   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
03261   if (RegInfo->needsStackRealignment(MF))
03262     return false;
03263 
03264   // Also avoid sibcall optimization if either caller or callee uses struct
03265   // return semantics.
03266   if (isCalleeStructRet || isCallerStructRet)
03267     return false;
03268 
03269   // A stdcall/thiscall caller is expected to clean up its arguments; the
03270   // callee isn't going to do that.
03271   // FIXME: this is more restrictive than needed. We could produce a tailcall
03272   // when the stack adjustment matches. For example, with a thiscall that takes
03273   // only one argument.
03274   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03275                    CallerCC == CallingConv::X86_ThisCall))
03276     return false;
03277 
03278   // Do not sibcall optimize vararg calls unless all arguments are passed via
03279   // registers.
03280   if (isVarArg && !Outs.empty()) {
03281 
03282     // Optimizing for varargs on Win64 is unlikely to be safe without
03283     // additional testing.
03284     if (IsCalleeWin64 || IsCallerWin64)
03285       return false;
03286 
03287     SmallVector<CCValAssign, 16> ArgLocs;
03288     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03289                    *DAG.getContext());
03290 
03291     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03292     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03293       if (!ArgLocs[i].isRegLoc())
03294         return false;
03295   }
03296 
03297   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03298   // stack.  Therefore, if the result is not used, it is not safe to optimize
03299   // this into a sibcall.
03300   bool Unused = false;
03301   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03302     if (!Ins[i].Used) {
03303       Unused = true;
03304       break;
03305     }
03306   }
03307   if (Unused) {
03308     SmallVector<CCValAssign, 16> RVLocs;
03309     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
03310                    *DAG.getContext());
03311     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03312     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03313       CCValAssign &VA = RVLocs[i];
03314       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
03315         return false;
03316     }
03317   }
03318 
03319   // If the calling conventions do not match, then we'd better make sure the
03320   // results are returned in the same way as what the caller expects.
03321   if (!CCMatch) {
03322     SmallVector<CCValAssign, 16> RVLocs1;
03323     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
03324                     *DAG.getContext());
03325     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03326 
03327     SmallVector<CCValAssign, 16> RVLocs2;
03328     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
03329                     *DAG.getContext());
03330     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03331 
03332     if (RVLocs1.size() != RVLocs2.size())
03333       return false;
03334     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03335       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03336         return false;
03337       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03338         return false;
03339       if (RVLocs1[i].isRegLoc()) {
03340         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03341           return false;
03342       } else {
03343         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03344           return false;
03345       }
03346     }
03347   }
03348 
03349   // If the callee takes no arguments then go on to check the results of the
03350   // call.
03351   if (!Outs.empty()) {
03352     // Check if stack adjustment is needed. For now, do not do this if any
03353     // argument is passed on the stack.
03354     SmallVector<CCValAssign, 16> ArgLocs;
03355     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03356                    *DAG.getContext());
03357 
03358     // Allocate shadow area for Win64
03359     if (IsCalleeWin64)
03360       CCInfo.AllocateStack(32, 8);
03361 
03362     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03363     if (CCInfo.getNextStackOffset()) {
03364       MachineFunction &MF = DAG.getMachineFunction();
03365       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03366         return false;
03367 
03368       // Check if the arguments are already laid out in the right way as
03369       // the caller's fixed stack objects.
03370       MachineFrameInfo *MFI = MF.getFrameInfo();
03371       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03372       const X86InstrInfo *TII = Subtarget->getInstrInfo();
03373       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03374         CCValAssign &VA = ArgLocs[i];
03375         SDValue Arg = OutVals[i];
03376         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03377         if (VA.getLocInfo() == CCValAssign::Indirect)
03378           return false;
03379         if (!VA.isRegLoc()) {
03380           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03381                                    MFI, MRI, TII))
03382             return false;
03383         }
03384       }
03385     }
03386 
03387     // If the tailcall address may be in a register, then make sure it's
03388     // possible to register allocate for it. In 32-bit, the call address can
03389     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03390     // callee-saved registers are restored. These happen to be the same
03391     // registers used to pass 'inreg' arguments so watch out for those.
03392     if (!Subtarget->is64Bit() &&
03393         ((!isa<GlobalAddressSDNode>(Callee) &&
03394           !isa<ExternalSymbolSDNode>(Callee)) ||
03395          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
03396       unsigned NumInRegs = 0;
03397       // In PIC we need an extra register to formulate the address computation
03398       // for the callee.
03399       unsigned MaxInRegs =
03400         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03401 
03402       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03403         CCValAssign &VA = ArgLocs[i];
03404         if (!VA.isRegLoc())
03405           continue;
03406         unsigned Reg = VA.getLocReg();
03407         switch (Reg) {
03408         default: break;
03409         case X86::EAX: case X86::EDX: case X86::ECX:
03410           if (++NumInRegs == MaxInRegs)
03411             return false;
03412           break;
03413         }
03414       }
03415     }
03416   }
03417 
03418   return true;
03419 }
03420 
03421 FastISel *
03422 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03423                                   const TargetLibraryInfo *libInfo) const {
03424   return X86::createFastISel(funcInfo, libInfo);
03425 }
03426 
03427 //===----------------------------------------------------------------------===//
03428 //                           Other Lowering Hooks
03429 //===----------------------------------------------------------------------===//
03430 
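      /// Return true if Op is a normal load with a single use, i.e. a load that
      /// could potentially be folded into its user's memory operand.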
03431 static bool MayFoldLoad(SDValue Op) {
03432   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03433 }
03434 
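      /// Return true if Op has a single use and that use is a normal store, so the
      /// value computed by Op could potentially be folded into the store.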
03435 static bool MayFoldIntoStore(SDValue Op) {
03436   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03437 }
03438 
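      /// Return true if Opcode is one of the X86-specific shuffle node opcodes
      /// produced when lowering vector shuffles.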
03439 static bool isTargetShuffle(unsigned Opcode) {
03440   switch(Opcode) {
03441   default: return false;
03442   case X86ISD::BLENDI:
03443   case X86ISD::PSHUFB:
03444   case X86ISD::PSHUFD:
03445   case X86ISD::PSHUFHW:
03446   case X86ISD::PSHUFLW:
03447   case X86ISD::SHUFP:
03448   case X86ISD::PALIGNR:
03449   case X86ISD::MOVLHPS:
03450   case X86ISD::MOVLHPD:
03451   case X86ISD::MOVHLPS:
03452   case X86ISD::MOVLPS:
03453   case X86ISD::MOVLPD:
03454   case X86ISD::MOVSHDUP:
03455   case X86ISD::MOVSLDUP:
03456   case X86ISD::MOVDDUP:
03457   case X86ISD::MOVSS:
03458   case X86ISD::MOVSD:
03459   case X86ISD::UNPCKL:
03460   case X86ISD::UNPCKH:
03461   case X86ISD::VPERMILPI:
03462   case X86ISD::VPERM2X128:
03463   case X86ISD::VPERMI:
03464     return true;
03465   }
03466 }
03467 
03468 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03469                                     SDValue V1, unsigned TargetMask,
03470                                     SelectionDAG &DAG) {
03471   switch(Opc) {
03472   default: llvm_unreachable("Unknown x86 shuffle node");
03473   case X86ISD::PSHUFD:
03474   case X86ISD::PSHUFHW:
03475   case X86ISD::PSHUFLW:
03476   case X86ISD::VPERMILPI:
03477   case X86ISD::VPERMI:
03478     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03479   }
03480 }
03481 
03482 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03483                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03484   switch(Opc) {
03485   default: llvm_unreachable("Unknown x86 shuffle node");
03486   case X86ISD::MOVLHPS:
03487   case X86ISD::MOVLHPD:
03488   case X86ISD::MOVHLPS:
03489   case X86ISD::MOVLPS:
03490   case X86ISD::MOVLPD:
03491   case X86ISD::MOVSS:
03492   case X86ISD::MOVSD:
03493   case X86ISD::UNPCKL:
03494   case X86ISD::UNPCKH:
03495     return DAG.getNode(Opc, dl, VT, V1, V2);
03496   }
03497 }
03498 
03499 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03500   MachineFunction &MF = DAG.getMachineFunction();
03501   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
03502   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03503   int ReturnAddrIndex = FuncInfo->getRAIndex();
03504 
03505   if (ReturnAddrIndex == 0) {
03506     // Set up a frame object for the return address.
03507     unsigned SlotSize = RegInfo->getSlotSize();
03508     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03509                                                            -(int64_t)SlotSize,
03510                                                            false);
03511     FuncInfo->setRAIndex(ReturnAddrIndex);
03512   }
03513 
03514   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03515 }
03516 
03517 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03518                                        bool hasSymbolicDisplacement) {
03519   // The offset should fit into a 32-bit immediate field.
03520   if (!isInt<32>(Offset))
03521     return false;
03522 
03523   // If we don't have a symbolic displacement, there are no extra
03524   // restrictions.
03525   if (!hasSymbolicDisplacement)
03526     return true;
03527 
03528   // FIXME: Some tweaks might be needed for medium code model.
03529   if (M != CodeModel::Small && M != CodeModel::Kernel)
03530     return false;
03531 
03532   // For the small code model we assume that the last object ends at least 16MB
03533   // before the 31-bit boundary. We can also accept fairly large negative
03534   // constants, since all objects live in the positive half of the address space.
03535   if (M == CodeModel::Small && Offset < 16*1024*1024)
03536     return true;
03537 
03538   // For the kernel code model we know that all objects reside in the negative
03539   // half of the 32-bit address space. We must not accept negative offsets, since
03540   // they may fall just outside an object; fairly large positive ones are fine.
03541   if (M == CodeModel::Kernel && Offset >= 0)
03542     return true;
03543 
03544   return false;
03545 }
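// Illustrative examples of the checks above: with a symbolic displacement,
// an Offset of 10*1024*1024 is accepted under the small code model (it stays
// below the 16MB cushion), while any negative Offset is rejected under the
// kernel code model.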
03546 
03547 /// isCalleePop - Determines whether the callee is required to pop its
03548 /// own arguments. Callee pop is necessary to support tail calls.
03549 bool X86::isCalleePop(CallingConv::ID CallingConv,
03550                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03551   switch (CallingConv) {
03552   default:
03553     return false;
03554   case CallingConv::X86_StdCall:
03555   case CallingConv::X86_FastCall:
03556   case CallingConv::X86_ThisCall:
03557     return !is64Bit;
03558   case CallingConv::Fast:
03559   case CallingConv::GHC:
03560   case CallingConv::HiPE:
03561     if (IsVarArg)
03562       return false;
03563     return TailCallOpt;
03564   }
03565 }
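// For example, a 32-bit X86_StdCall callee pops its own stack arguments
// (ret imm16), so isCalleePop returns true for it, whereas the same
// convention on a 64-bit target returns false.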
03566 
03567 /// \brief Return true if the condition is an unsigned comparison operation.
03568 static bool isX86CCUnsigned(unsigned X86CC) {
03569   switch (X86CC) {
03570   default: llvm_unreachable("Invalid integer condition!");
03571   case X86::COND_E:     return true;
03572   case X86::COND_G:     return false;
03573   case X86::COND_GE:    return false;
03574   case X86::COND_L:     return false;
03575   case X86::COND_LE:    return false;
03576   case X86::COND_NE:    return true;
03577   case X86::COND_B:     return true;
03578   case X86::COND_A:     return true;
03579   case X86::COND_BE:    return true;
03580   case X86::COND_AE:    return true;
03581   }
03582   llvm_unreachable("covered switch fell through?!");
03583 }
03584 
03585 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
03586 /// specific condition code, returning the condition code and the LHS/RHS of the
03587 /// comparison to make.
03588 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03589                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03590   if (!isFP) {
03591     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03592       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03593         // X > -1  -> compare X against 0, jump if !sign.
03594         RHS = DAG.getConstant(0, RHS.getValueType());
03595         return X86::COND_NS;
03596       }
03597       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03598         // X < 0  -> compare X against 0, jump on sign.
03599         return X86::COND_S;
03600       }
03601       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03602         // X < 1   -> X <= 0
03603         RHS = DAG.getConstant(0, RHS.getValueType());
03604         return X86::COND_LE;
03605       }
03606     }
03607 
03608     switch (SetCCOpcode) {
03609     default: llvm_unreachable("Invalid integer condition!");
03610     case ISD::SETEQ:  return X86::COND_E;
03611     case ISD::SETGT:  return X86::COND_G;
03612     case ISD::SETGE:  return X86::COND_GE;
03613     case ISD::SETLT:  return X86::COND_L;
03614     case ISD::SETLE:  return X86::COND_LE;
03615     case ISD::SETNE:  return X86::COND_NE;
03616     case ISD::SETULT: return X86::COND_B;
03617     case ISD::SETUGT: return X86::COND_A;
03618     case ISD::SETULE: return X86::COND_BE;
03619     case ISD::SETUGE: return X86::COND_AE;
03620     }
03621   }
03622 
03623   // First determine if it is required or is profitable to flip the operands.
03624 
03625   // If LHS is a foldable load, but RHS is not, flip the condition.
03626   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03627       !ISD::isNON_EXTLoad(RHS.getNode())) {
03628     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03629     std::swap(LHS, RHS);
03630   }
03631 
03632   switch (SetCCOpcode) {
03633   default: break;
03634   case ISD::SETOLT:
03635   case ISD::SETOLE:
03636   case ISD::SETUGT:
03637   case ISD::SETUGE:
03638     std::swap(LHS, RHS);
03639     break;
03640   }
03641 
03642   // On a floating point condition, the flags are set as follows:
03643   // ZF  PF  CF   op
03644   //  0 | 0 | 0 | X > Y
03645   //  0 | 0 | 1 | X < Y
03646   //  1 | 0 | 0 | X == Y
03647   //  1 | 1 | 1 | unordered
03648   switch (SetCCOpcode) {
03649   default: llvm_unreachable("Condcode should be pre-legalized away");
03650   case ISD::SETUEQ:
03651   case ISD::SETEQ:   return X86::COND_E;
03652   case ISD::SETOLT:              // flipped
03653   case ISD::SETOGT:
03654   case ISD::SETGT:   return X86::COND_A;
03655   case ISD::SETOLE:              // flipped
03656   case ISD::SETOGE:
03657   case ISD::SETGE:   return X86::COND_AE;
03658   case ISD::SETUGT:              // flipped
03659   case ISD::SETULT:
03660   case ISD::SETLT:   return X86::COND_B;
03661   case ISD::SETUGE:              // flipped
03662   case ISD::SETULE:
03663   case ISD::SETLE:   return X86::COND_BE;
03664   case ISD::SETONE:
03665   case ISD::SETNE:   return X86::COND_NE;
03666   case ISD::SETUO:   return X86::COND_P;
03667   case ISD::SETO:    return X86::COND_NP;
03668   case ISD::SETOEQ:
03669   case ISD::SETUNE:  return X86::COND_INVALID;
03670   }
03671 }
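// Worked example for the mapping above: an integer SETULT translates
// directly to COND_B, while a floating-point SETOLT first has its operands
// swapped (see the flip switch) and is then handled as the "flipped" SETOLT
// case, yielding COND_A per the flag table.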
03672 
03673 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
03674 /// code. The current x86 ISA includes the following FP cmov instructions:
03675 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03676 static bool hasFPCMov(unsigned X86CC) {
03677   switch (X86CC) {
03678   default:
03679     return false;
03680   case X86::COND_B:
03681   case X86::COND_BE:
03682   case X86::COND_E:
03683   case X86::COND_P:
03684   case X86::COND_A:
03685   case X86::COND_AE:
03686   case X86::COND_NE:
03687   case X86::COND_NP:
03688     return true;
03689   }
03690 }
03691 
03692 /// isFPImmLegal - Returns true if the target can instruction select the
03693 /// specified FP immediate natively. If false, the legalizer will
03694 /// materialize the FP immediate as a load from a constant pool.
03695 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03696   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03697     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03698       return true;
03699   }
03700   return false;
03701 }
03702 
03703 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
03704                                               ISD::LoadExtType ExtTy,
03705                                               EVT NewVT) const {
03706   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
03707   // relocation target a movq or addq instruction: don't let the load shrink.
03708   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
03709   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
03710     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
03711       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
03712   return true;
03713 }
03714 
03715 /// \brief Returns true if it is beneficial to convert a load of a constant
03716 /// to just the constant itself.
03717 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03718                                                           Type *Ty) const {
03719   assert(Ty->isIntegerTy());
03720 
03721   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03722   if (BitSize == 0 || BitSize > 64)
03723     return false;
03724   return true;
03725 }
03726 
03727 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
03728                                                 unsigned Index) const {
03729   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
03730     return false;
03731 
03732   return (Index == 0 || Index == ResVT.getVectorNumElements());
03733 }
03734 
03735 bool X86TargetLowering::isCheapToSpeculateCttz() const {
03736   // Speculate cttz only if we can directly use TZCNT.
03737   return Subtarget->hasBMI();
03738 }
03739 
03740 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
03741   // Speculate ctlz only if we can directly use LZCNT.
03742   return Subtarget->hasLZCNT();
03743 }
03744 
03745 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03746 /// the specified half-open range [Low, Hi).
03747 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03748   return (Val < 0) || (Val >= Low && Val < Hi);
03749 }
03750 
03751 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03752 /// specified value.
03753 static bool isUndefOrEqual(int Val, int CmpVal) {
03754   return (Val < 0 || Val == CmpVal);
03755 }
03756 
03757 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03758 /// from position Pos and ending in Pos+Size, falls within the specified
03759 /// sequential range [Low, Low+Size), or is undef.
03760 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03761                                        unsigned Pos, unsigned Size, int Low) {
03762   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03763     if (!isUndefOrEqual(Mask[i], Low))
03764       return false;
03765   return true;
03766 }
03767 
03768 /// isVEXTRACTIndex - Return true if the specified
03769 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
03770 /// suitable for instructions that extract 128- or 256-bit vectors.
03771 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
03772   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
03773   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
03774     return false;
03775 
03776   // The index should be aligned on a vecWidth-bit boundary.
03777   uint64_t Index =
03778     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
03779 
03780   MVT VT = N->getSimpleValueType(0);
03781   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
03782   bool Result = (Index * ElSize) % vecWidth == 0;
03783 
03784   return Result;
03785 }
03786 
03787 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
03788 /// operand specifies a subvector insert that is suitable for the
03789 /// insertion of 128- or 256-bit subvectors.
03790 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
03791   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
03792   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
03793     return false;
03794   // The index should be aligned on a vecWidth-bit boundary.
03795   uint64_t Index =
03796     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
03797 
03798   MVT VT = N->getSimpleValueType(0);
03799   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
03800   bool Result = (Index * ElSize) % vecWidth == 0;
03801 
03802   return Result;
03803 }
03804 
03805 bool X86::isVINSERT128Index(SDNode *N) {
03806   return isVINSERTIndex(N, 128);
03807 }
03808 
03809 bool X86::isVINSERT256Index(SDNode *N) {
03810   return isVINSERTIndex(N, 256);
03811 }
03812 
03813 bool X86::isVEXTRACT128Index(SDNode *N) {
03814   return isVEXTRACTIndex(N, 128);
03815 }
03816 
03817 bool X86::isVEXTRACT256Index(SDNode *N) {
03818   return isVEXTRACTIndex(N, 256);
03819 }
03820 
03821 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
03822   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
03823   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
03824     llvm_unreachable("Illegal extract subvector for VEXTRACT");
03825 
03826   uint64_t Index =
03827     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
03828 
03829   MVT VecVT = N->getOperand(0).getSimpleValueType();
03830   MVT ElVT = VecVT.getVectorElementType();
03831 
03832   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
03833   return Index / NumElemsPerChunk;
03834 }
03835 
03836 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
03837   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
03838   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
03839     llvm_unreachable("Illegal insert subvector for VINSERT");
03840 
03841   uint64_t Index =
03842     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
03843 
03844   MVT VecVT = N->getSimpleValueType(0);
03845   MVT ElVT = VecVT.getVectorElementType();
03846 
03847   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
03848   return Index / NumElemsPerChunk;
03849 }
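// Worked example for the two helpers above: for a v8f32 with a 128-bit
// chunk width, NumElemsPerChunk = 128 / 32 = 4, so an element index of 4
// produces the immediate 4 / 4 = 1, i.e. the upper 128-bit half.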
03850 
03851 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
03852 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
03853 /// and VEXTRACTI128 instructions.
03854 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
03855   return getExtractVEXTRACTImmediate(N, 128);
03856 }
03857 
03858 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
03859 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
03860 /// and VEXTRACTI64x4 instructions.
03861 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
03862   return getExtractVEXTRACTImmediate(N, 256);
03863 }
03864 
03865 /// getInsertVINSERT128Immediate - Return the appropriate immediate
03866 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
03867 /// and VINSERTI128 instructions.
03868 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
03869   return getInsertVINSERTImmediate(N, 128);
03870 }
03871 
03872 /// getInsertVINSERT256Immediate - Return the appropriate immediate
03873 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
03874 /// and VINSERTI64x4 instructions.
03875 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
03876   return getInsertVINSERTImmediate(N, 256);
03877 }
03878 
03879 /// isZero - Returns true if V is a constant integer zero.
03880 static bool isZero(SDValue V) {
03881   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
03882   return C && C->isNullValue();
03883 }
03884 
03885 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
03886 /// constant +0.0.
03887 bool X86::isZeroNode(SDValue Elt) {
03888   if (isZero(Elt))
03889     return true;
03890   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
03891     return CFP->getValueAPF().isPosZero();
03892   return false;
03893 }
03894 
03895 /// getZeroVector - Returns a vector of specified type with all zero elements.
03896 ///
03897 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
03898                              SelectionDAG &DAG, SDLoc dl) {
03899   assert(VT.isVector() && "Expected a vector type");
03900 
03901   // Always build SSE zero vectors as <4 x i32> bitcasted
03902   // to their dest type. This ensures they get CSE'd.
03903   SDValue Vec;
03904   if (VT.is128BitVector()) {  // SSE
03905     if (Subtarget->hasSSE2()) {  // SSE2
03906       SDValue Cst = DAG.getConstant(0, MVT::i32);
03907       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
03908     } else { // SSE1
03909       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
03910       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
03911     }
03912   } else if (VT.is256BitVector()) { // AVX
03913     if (Subtarget->hasInt256()) { // AVX2
03914       SDValue Cst = DAG.getConstant(0, MVT::i32);
03915       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
03916       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
03917     } else {
03918       // 256-bit logic and arithmetic instructions in AVX are all
03919       // floating-point; there is no support for integer ops. Emit FP zero vectors.
03920       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
03921       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
03922       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
03923     }
03924   } else if (VT.is512BitVector()) { // AVX-512
03925       SDValue Cst = DAG.getConstant(0, MVT::i32);
03926       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
03927                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
03928       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
03929   } else if (VT.getScalarType() == MVT::i1) {
03930 
03931     assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
03932             && "Unexpected vector type");
03933     assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
03934             && "Unexpected vector type");
03935     SDValue Cst = DAG.getConstant(0, MVT::i1);
03936     SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
03937     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
03938   } else
03939     llvm_unreachable("Unexpected vector type");
03940 
03941   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
03942 }
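// E.g. a zero v4f32 on an SSE2 target is built as a v4i32 of zero constants
// and then bitcast to v4f32, so all 128-bit zero vectors share one CSE'd
// BUILD_VECTOR node.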
03943 
03944 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
03945                                 SelectionDAG &DAG, SDLoc dl,
03946                                 unsigned vectorWidth) {
03947   assert((vectorWidth == 128 || vectorWidth == 256) &&
03948          "Unsupported vector width");
03949   EVT VT = Vec.getValueType();
03950   EVT ElVT = VT.getVectorElementType();
03951   unsigned Factor = VT.getSizeInBits()/vectorWidth;
03952   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
03953                                   VT.getVectorNumElements()/Factor);
03954 
03955   // Extract from UNDEF is UNDEF.
03956   if (Vec.getOpcode() == ISD::UNDEF)
03957     return DAG.getUNDEF(ResultVT);
03958 
03959   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
03960   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
03961 
03962   // This is the index of the first element of the vectorWidth-bit chunk
03963   // we want.
03964   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
03965                                * ElemsPerChunk);
03966 
03967   // If the input is a buildvector just emit a smaller one.
03968   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
03969     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
03970                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
03971                                     ElemsPerChunk));
03972 
03973   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
03974   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
03975 }
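// E.g. extracting 128 bits from a v8i32 with IdxVal = 5 rounds the index
// down to a chunk boundary: ElemsPerChunk = 4 and
// NormalizedIdxVal = ((5 * 32) / 128) * 4 = 4, so elements 4..7 are taken.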
03976 
03977 /// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
03978 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
03979 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
03980 /// instructions or a simple subregister reference. Idx is an index in the
03981 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
03982 /// lowering EXTRACT_VECTOR_ELT operations easier.
03983 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
03984                                    SelectionDAG &DAG, SDLoc dl) {
03985   assert((Vec.getValueType().is256BitVector() ||
03986           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
03987   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
03988 }
03989 
03990 /// Generate a DAG to grab 256-bits from a 512-bit vector.
03991 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
03992                                    SelectionDAG &DAG, SDLoc dl) {
03993   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
03994   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
03995 }
03996 
03997 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
03998                                unsigned IdxVal, SelectionDAG &DAG,
03999                                SDLoc dl, unsigned vectorWidth) {
04000   assert((vectorWidth == 128 || vectorWidth == 256) &&
04001          "Unsupported vector width");
04002   // Inserting an UNDEF subvector leaves Result unchanged.
04003   if (Vec.getOpcode() == ISD::UNDEF)
04004     return Result;
04005   EVT VT = Vec.getValueType();
04006   EVT ElVT = VT.getVectorElementType();
04007   EVT ResultVT = Result.getValueType();
04008 
04009   // Insert the relevant vectorWidth bits.
04010   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
04011 
04012   // This is the index of the first element of the vectorWidth-bit chunk
04013   // we want.
04014   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
04015                                * ElemsPerChunk);
04016 
04017   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
04018   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
04019 }
04020 
04021 /// Generate a DAG to put 128 bits into a vector > 128 bits.  This
04022 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
04023 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
04024 /// simple superregister reference.  Idx is an index in the 128 bits
04025 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
04026 /// lowering INSERT_VECTOR_ELT operations easier.
04027 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
04028                                   SelectionDAG &DAG, SDLoc dl) {
04029   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
04030 
04031   // For insertion into the zero index (low half) of a 256-bit vector, it is
04032   // more efficient to generate a blend with immediate instead of an insert*128.
04033   // We are still creating an INSERT_SUBVECTOR below with an undef node to
04034   // extend the subvector to the size of the result vector. Make sure that
04035   // we are not recursing on that node by checking for undef here.
04036   if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
04037       Result.getOpcode() != ISD::UNDEF) {
04038     EVT ResultVT = Result.getValueType();
04039     SDValue ZeroIndex = DAG.getIntPtrConstant(0);
04040     SDValue Undef = DAG.getUNDEF(ResultVT);
04041     SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
04042                                  Vec, ZeroIndex);
04043 
04044     // The blend instruction, and therefore its mask, depend on the data type.
04045     MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
04046     if (ScalarType.isFloatingPoint()) {
04047       // Choose either vblendps (float) or vblendpd (double).
04048       unsigned ScalarSize = ScalarType.getSizeInBits();
04049       assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
04050       unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
04051       SDValue Mask = DAG.getConstant(MaskVal, MVT::i8);
04052       return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
04053     }
04054 
04055     const X86Subtarget &Subtarget =
04056         static_cast<const X86Subtarget &>(DAG.getSubtarget());
04057 
04058     // AVX2 is needed for 256-bit integer blend support.
04059     // Integers must be cast to 32-bit because there is only vpblendd;
04060     // vpblendw can't be used for this because it has a handicapped mask.
04061 
04062     // If we don't have AVX2, then cast to float. Using a wrong domain blend
04063     // is still more efficient than using the wrong domain vinsertf128 that
04064     // will be created by InsertSubVector().
04065     MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
04066 
04067     SDValue Mask = DAG.getConstant(0x0f, MVT::i8);
04068     Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
04069     Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
04070     return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
04071   }
04072 
04073   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
04074 }
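// Note on the blend masks used above: the blend selects from Vec256 for
// every mask bit that is set, so 0x03 takes the two low f64 elements and
// 0x0f takes the four low 32-bit elements, which is exactly the 128-bit
// subvector being inserted at index 0.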
04075 
04076 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
04077                                   SelectionDAG &DAG, SDLoc dl) {
04078   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
04079   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
04080 }
04081 
04082 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
04083 /// instructions. This is used because creating CONCAT_VECTORS nodes of
04084 /// BUILD_VECTORs returns a larger BUILD_VECTOR while we're trying to lower
04085 /// large BUILD_VECTORs.
04086 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
04087                                    unsigned NumElems, SelectionDAG &DAG,
04088                                    SDLoc dl) {
04089   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
04090   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
04091 }
04092 
04093 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
04094                                    unsigned NumElems, SelectionDAG &DAG,
04095                                    SDLoc dl) {
04096   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
04097   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
04098 }
04099 
04100 /// getOnesVector - Returns a vector of specified type with all bits set.
04101 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
04102 /// no AVX2 support, use two <4 x i32> vectors inserted into an <8 x i32> appropriately.
04103 /// Then bitcast to their original type, ensuring they get CSE'd.
04104 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
04105                              SDLoc dl) {
04106   assert(VT.isVector() && "Expected a vector type");
04107 
04108   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
04109   SDValue Vec;
04110   if (VT.is256BitVector()) {
04111     if (HasInt256) { // AVX2
04112       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04113       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
04114     } else { // AVX
04115       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04116       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
04117     }
04118   } else if (VT.is128BitVector()) {
04119     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04120   } else
04121     llvm_unreachable("Unexpected vector type");
04122 
04123   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04124 }
04125 
04126 /// getMOVL - Returns a vector_shuffle node for a movs{s|d} or movd
04127 /// operation of the specified width.
04128 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
04129                        SDValue V2) {
04130   unsigned NumElems = VT.getVectorNumElements();
04131   SmallVector<int, 8> Mask;
04132   Mask.push_back(NumElems);
04133   for (unsigned i = 1; i != NumElems; ++i)
04134     Mask.push_back(i);
04135   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04136 }
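// E.g. for a v4i32 operation this builds the mask <4,1,2,3>: element 0 is
// taken from V2 (indices >= NumElems select from V2) and the remaining
// elements are kept from V1, matching movss/movsd semantics.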
04137 
04138 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
04139 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04140                           SDValue V2) {
04141   unsigned NumElems = VT.getVectorNumElements();
04142   SmallVector<int, 8> Mask;
04143   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
04144     Mask.push_back(i);
04145     Mask.push_back(i + NumElems);
04146   }
04147   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04148 }
04149 
04150 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
04151 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04152                           SDValue V2) {
04153   unsigned NumElems = VT.getVectorNumElements();
04154   SmallVector<int, 8> Mask;
04155   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
04156     Mask.push_back(i + Half);
04157     Mask.push_back(i + NumElems + Half);
04158   }
04159   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04160 }
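// E.g. for v4i32, getUnpackl builds the mask <0,4,1,5> and getUnpackh
// builds <2,6,3,7>, interleaving the low and high halves of V1 and V2
// respectively (indices >= 4 select from V2).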
04161 
04162 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
04163 /// vector and a zero or undef vector.  This produces a shuffle where the low
04164 /// element of V2 is swizzled into the zero/undef vector, landing at element
04165 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
04166 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
04167                                            bool IsZero,
04168                                            const X86Subtarget *Subtarget,
04169                                            SelectionDAG &DAG) {
04170   MVT VT = V2.getSimpleValueType();
04171   SDValue V1 = IsZero
04172     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
04173   unsigned NumElems = VT.getVectorNumElements();
04174   SmallVector<int, 16> MaskVec;
04175   for (unsigned i = 0; i != NumElems; ++i)
04176     // If this is the insertion idx, put the low elt of V2 here.
04177     MaskVec.push_back(i == Idx ? NumElems : i);
04178   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
04179 }
04180 
04181 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
04182 /// target specific opcode. Returns true if the Mask could be calculated. Sets
04183 /// IsUnary to true if only uses one source. Note that this will set IsUnary for
04184 /// shuffles which use a single input multiple times, and in those cases it will
04185 /// adjust the mask to only have indices within that single input.
04186 static bool getTargetShuffleMask(SDNode *N, MVT VT,
04187                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
04188   unsigned NumElems = VT.getVectorNumElements();
04189   SDValue ImmN;
04190 
04191   IsUnary = false;
04192   bool IsFakeUnary = false;
04193   switch(N->getOpcode()) {
04194   case X86ISD::BLENDI:
04195     ImmN = N->getOperand(N->getNumOperands()-1);
04196     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04197     break;
04198   case X86ISD::SHUFP:
04199     ImmN = N->getOperand(N->getNumOperands()-1);
04200     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04201     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04202     break;
04203   case X86ISD::UNPCKH:
04204     DecodeUNPCKHMask(VT, Mask);
04205     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04206     break;
04207   case X86ISD::UNPCKL:
04208     DecodeUNPCKLMask(VT, Mask);
04209     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04210     break;
04211   case X86ISD::MOVHLPS:
04212     DecodeMOVHLPSMask(NumElems, Mask);
04213     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04214     break;
04215   case X86ISD::MOVLHPS:
04216     DecodeMOVLHPSMask(NumElems, Mask);
04217     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04218     break;
04219   case X86ISD::PALIGNR:
04220     ImmN = N->getOperand(N->getNumOperands()-1);
04221     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04222     break;
04223   case X86ISD::PSHUFD:
04224   case X86ISD::VPERMILPI:
04225     ImmN = N->getOperand(N->getNumOperands()-1);
04226     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04227     IsUnary = true;
04228     break;
04229   case X86ISD::PSHUFHW:
04230     ImmN = N->getOperand(N->getNumOperands()-1);
04231     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04232     IsUnary = true;
04233     break;
04234   case X86ISD::PSHUFLW:
04235     ImmN = N->getOperand(N->getNumOperands()-1);
04236     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04237     IsUnary = true;
04238     break;
04239   case X86ISD::PSHUFB: {
04240     IsUnary = true;
04241     SDValue MaskNode = N->getOperand(1);
04242     while (MaskNode->getOpcode() == ISD::BITCAST)
04243       MaskNode = MaskNode->getOperand(0);
04244 
04245     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
04246       // If we have a build-vector, then things are easy.
04247       EVT VT = MaskNode.getValueType();
04248       assert(VT.isVector() &&
04249              "Can't produce a non-vector with a build_vector!");
04250       if (!VT.isInteger())
04251         return false;
04252 
04253       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
04254 
04255       SmallVector<uint64_t, 32> RawMask;
04256       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
04257         SDValue Op = MaskNode->getOperand(i);
04258         if (Op->getOpcode() == ISD::UNDEF) {
04259           RawMask.push_back((uint64_t)SM_SentinelUndef);
04260           continue;
04261         }
04262         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
04263         if (!CN)
04264           return false;
04265         APInt MaskElement = CN->getAPIntValue();
04266 
04267         // We now have to decode the element which could be any integer size and
04268         // extract each byte of it.
04269         for (int j = 0; j < NumBytesPerElement; ++j) {
04270           // Note that this is x86 and so always little endian: the low byte is
04271           // the first byte of the mask.
04272           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
04273           MaskElement = MaskElement.lshr(8);
04274         }
04275       }
04276       DecodePSHUFBMask(RawMask, Mask);
04277       break;
04278     }
04279 
04280     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
04281     if (!MaskLoad)
04282       return false;
04283 
04284     SDValue Ptr = MaskLoad->getBasePtr();
04285     if (Ptr->getOpcode() == X86ISD::Wrapper)
04286       Ptr = Ptr->getOperand(0);
04287 
04288     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
04289     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
04290       return false;
04291 
04292     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
04293       DecodePSHUFBMask(C, Mask);
04294       if (Mask.empty())
04295         return false;
04296       break;
04297     }
04298 
04299     return false;
04300   }
04301   case X86ISD::VPERMI:
04302     ImmN = N->getOperand(N->getNumOperands()-1);
04303     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04304     IsUnary = true;
04305     break;
04306   case X86ISD::MOVSS:
04307   case X86ISD::MOVSD:
04308     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
04309     break;
04310   case X86ISD::VPERM2X128:
04311     ImmN = N->getOperand(N->getNumOperands()-1);
04312     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04313     if (Mask.empty()) return false;
04314     break;
04315   case X86ISD::MOVSLDUP:
04316     DecodeMOVSLDUPMask(VT, Mask);
04317     IsUnary = true;
04318     break;
04319   case X86ISD::MOVSHDUP:
04320     DecodeMOVSHDUPMask(VT, Mask);
04321     IsUnary = true;
04322     break;
04323   case X86ISD::MOVDDUP:
04324     DecodeMOVDDUPMask(VT, Mask);
04325     IsUnary = true;
04326     break;
04327   case X86ISD::MOVLHPD:
04328   case X86ISD::MOVLPD:
04329   case X86ISD::MOVLPS:
04330     // Not yet implemented
04331     return false;
04332   default: llvm_unreachable("unknown target shuffle node");
04333   }
04334 
04335   // If we have a fake unary shuffle, the shuffle mask is spread across two
04336   // inputs that are actually the same node. Re-map the mask to always point
04337   // into the first input.
04338   if (IsFakeUnary)
04339     for (int &M : Mask)
04340       if (M >= (int)Mask.size())
04341         M -= Mask.size();
04342 
04343   return true;
04344 }
04345 
04346 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
04347 /// element of the result of the vector shuffle.
04348 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
04349                                    unsigned Depth) {
04350   if (Depth == 6)
04351     return SDValue();  // Limit search depth.
04352 
04353   SDValue V = SDValue(N, 0);
04354   EVT VT = V.getValueType();
04355   unsigned Opcode = V.getOpcode();
04356 
04357   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
04358   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
04359     int Elt = SV->getMaskElt(Index);
04360 
04361     if (Elt < 0)
04362       return DAG.getUNDEF(VT.getVectorElementType());
04363 
04364     unsigned NumElems = VT.getVectorNumElements();
04365     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
04366                                          : SV->getOperand(1);
04367     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
04368   }
04369 
04370   // Recurse into target specific vector shuffles to find scalars.
04371   if (isTargetShuffle(Opcode)) {
04372     MVT ShufVT = V.getSimpleValueType();
04373     unsigned NumElems = ShufVT.getVectorNumElements();
04374     SmallVector<int, 16> ShuffleMask;
04375     bool IsUnary;
04376 
04377     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
04378       return SDValue();
04379 
04380     int Elt = ShuffleMask[Index];
04381     if (Elt < 0)
04382       return DAG.getUNDEF(ShufVT.getVectorElementType());
04383 
04384     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
04385                                          : N->getOperand(1);
04386     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
04387                                Depth+1);
04388   }
04389 
04390   // Actual nodes that may contain scalar elements
04391   if (Opcode == ISD::BITCAST) {
04392     V = V.getOperand(0);
04393     EVT SrcVT = V.getValueType();
04394     unsigned NumElems = VT.getVectorNumElements();
04395 
04396     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
04397       return SDValue();
04398   }
04399 
04400   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
04401     return (Index == 0) ? V.getOperand(0)
04402                         : DAG.getUNDEF(VT.getVectorElementType());
04403 
04404   if (V.getOpcode() == ISD::BUILD_VECTOR)
04405     return V.getOperand(Index);
04406 
04407   return SDValue();
04408 }
04409 
04410 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
04411 ///
04412 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
04413                                        unsigned NumNonZero, unsigned NumZero,
04414                                        SelectionDAG &DAG,
04415                                        const X86Subtarget* Subtarget,
04416                                        const TargetLowering &TLI) {
04417   if (NumNonZero > 8)
04418     return SDValue();
04419 
04420   SDLoc dl(Op);
04421   SDValue V;
04422   bool First = true;
04423   for (unsigned i = 0; i < 16; ++i) {
04424     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
04425     if (ThisIsNonZero && First) {
04426       if (NumZero)
04427         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
04428       else
04429         V = DAG.getUNDEF(MVT::v8i16);
04430       First = false;
04431     }
04432 
04433     if ((i & 1) != 0) {
04434       SDValue ThisElt, LastElt;
04435       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
04436       if (LastIsNonZero) {
04437         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
04438                               MVT::i16, Op.getOperand(i-1));
04439       }
04440       if (ThisIsNonZero) {
04441         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
04442         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
04443                               ThisElt, DAG.getConstant(8, MVT::i8));
04444         if (LastIsNonZero)
04445           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
04446       } else
04447         ThisElt = LastElt;
04448 
04449       if (ThisElt.getNode())
04450         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
04451                         DAG.getIntPtrConstant(i/2));
04452     }
04453   }
04454 
04455   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
04456 }
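// In the lowering above, adjacent byte pairs are combined into 16-bit
// values (low byte | high byte << 8), inserted into a v8i16 at index i/2,
// and the result is bitcast back to v16i8.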
04457 
04458 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
04459 ///
04460 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
04461                                      unsigned NumNonZero, unsigned NumZero,
04462                                      SelectionDAG &DAG,
04463                                      const X86Subtarget* Subtarget,
04464                                      const TargetLowering &TLI) {
04465   if (NumNonZero > 4)
04466     return SDValue();
04467 
04468   SDLoc dl(Op);
04469   SDValue V;
04470   bool First = true;
04471   for (unsigned i = 0; i < 8; ++i) {
04472     bool isNonZero = (NonZeros & (1 << i)) != 0;
04473     if (isNonZero) {
04474       if (First) {
04475         if (NumZero)
04476           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
04477         else
04478           V = DAG.getUNDEF(MVT::v8i16);
04479         First = false;
04480       }
04481       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
04482                       MVT::v8i16, V, Op.getOperand(i),
04483                       DAG.getIntPtrConstant(i));
04484     }
04485   }
04486 
04487   return V;
04488 }
04489 
04490 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
04491 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
04492                                      const X86Subtarget *Subtarget,
04493                                      const TargetLowering &TLI) {
04494   // Find all zeroable elements.
04495   std::bitset<4> Zeroable;
04496   for (int i=0; i < 4; ++i) {
04497     SDValue Elt = Op->getOperand(i);
04498     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
04499   }
04500   assert(Zeroable.size() - Zeroable.count() > 1 &&
04501          "We expect at least two non-zero elements!");
04502 
04503   // We only know how to deal with build_vector nodes where elements are either
04504   // zeroable or extract_vector_elt with constant index.
04505   SDValue FirstNonZero;
04506   unsigned FirstNonZeroIdx;
04507   for (unsigned i=0; i < 4; ++i) {
04508     if (Zeroable[i])
04509       continue;
04510     SDValue Elt = Op->getOperand(i);
04511     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
04512         !isa<ConstantSDNode>(Elt.getOperand(1)))
04513       return SDValue();
04514     // Make sure that this node is extracting from a 128-bit vector.
04515     MVT VT = Elt.getOperand(0).getSimpleValueType();
04516     if (!VT.is128BitVector())
04517       return SDValue();
04518     if (!FirstNonZero.getNode()) {
04519       FirstNonZero = Elt;
04520       FirstNonZeroIdx = i;
04521     }
04522   }
04523 
04524   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
04525   SDValue V1 = FirstNonZero.getOperand(0);
04526   MVT VT = V1.getSimpleValueType();
04527 
04528   // See if this build_vector can be lowered as a blend with zero.
04529   SDValue Elt;
04530   unsigned EltMaskIdx, EltIdx;
04531   int Mask[4];
04532   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
04533     if (Zeroable[EltIdx]) {
04534       // The zero vector will be on the right hand side.
04535       Mask[EltIdx] = EltIdx+4;
04536       continue;
04537     }
04538 
04539     Elt = Op->getOperand(EltIdx);
04540     // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
04541     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
04542     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
04543       break;
04544     Mask[EltIdx] = EltIdx;
04545   }
04546 
04547   if (EltIdx == 4) {
04548     // Let the shuffle legalizer deal with blend operations.
04549     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
04550     if (V1.getSimpleValueType() != VT)
04551       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
04552     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
04553   }
04554 
04555   // See if we can lower this build_vector to a INSERTPS.
04556   if (!Subtarget->hasSSE41())
04557     return SDValue();
04558 
04559   SDValue V2 = Elt.getOperand(0);
04560   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
04561     V1 = SDValue();
04562 
04563   bool CanFold = true;
04564   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
04565     if (Zeroable[i])
04566       continue;
04567 
04568     SDValue Current = Op->getOperand(i);
04569     SDValue SrcVector = Current->getOperand(0);
04570     if (!V1.getNode())
04571       V1 = SrcVector;
04572     CanFold = SrcVector == V1 &&
04573       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
04574   }
04575 
04576   if (!CanFold)
04577     return SDValue();
04578 
04579   assert(V1.getNode() && "Expected at least two non-zero elements!");
04580   if (V1.getSimpleValueType() != MVT::v4f32)
04581     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
04582   if (V2.getSimpleValueType() != MVT::v4f32)
04583     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
04584 
04585   // Ok, we can emit an INSERTPS instruction.
04586   unsigned ZMask = Zeroable.to_ulong();
04587 
04588   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
04589   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
04590   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
04591                                DAG.getIntPtrConstant(InsertPSMask));
04592   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
04593 }
04594 
04595 /// Return a vector logical shift node.
04596 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
04597                          unsigned NumBits, SelectionDAG &DAG,
04598                          const TargetLowering &TLI, SDLoc dl) {
04599   assert(VT.is128BitVector() && "Unknown type for VShift");
04600   MVT ShVT = MVT::v2i64;
04601   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
04602   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
04603   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
04604   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
04605   SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy);
04606   return DAG.getNode(ISD::BITCAST, dl, VT,
04607                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
04608 }
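// E.g. a 32-bit left shift of a 128-bit vector becomes an X86ISD::VSHLDQ
// (byte shift) by 32 / 8 = 4 bytes on the v2i64-bitcast value, which is
// then bitcast back to the original type.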
04609 
04610 static SDValue
04611 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
04612 
04613   // Check if the scalar load can be widened into a vector load, and if
04614   // the address is "base + cst", see if the cst can be "absorbed" into
04615   // the shuffle mask.
04616   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
04617     SDValue Ptr = LD->getBasePtr();
04618     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
04619       return SDValue();
04620     EVT PVT = LD->getValueType(0);
04621     if (PVT != MVT::i32 && PVT != MVT::f32)
04622       return SDValue();
04623 
04624     int FI = -1;
04625     int64_t Offset = 0;
04626     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
04627       FI = FINode->getIndex();
04628       Offset = 0;
04629     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
04630                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
04631       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
04632       Offset = Ptr.getConstantOperandVal(1);
04633       Ptr = Ptr.getOperand(0);
04634     } else {
04635       return SDValue();
04636     }
04637 
04638     // FIXME: 256-bit vector instructions don't require a strict alignment,
04639     // improve this code to support it better.
04640     unsigned RequiredAlign = VT.getSizeInBits()/8;
04641     SDValue Chain = LD->getChain();
04642     // Make sure the stack object alignment is at least 16 or 32.
04643     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
04644     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
04645       if (MFI->isFixedObjectIndex(FI)) {
04646         // Can't change the alignment. FIXME: It's possible to compute
04647         // the exact stack offset and reference FI + adjust offset instead.
04648         // If someone *really* cares about this. That's the way to implement it.
04649         return SDValue();
04650       } else {
04651         MFI->setObjectAlignment(FI, RequiredAlign);
04652       }
04653     }
04654 
04655     // (Offset % 16 or 32) must be a multiple of 4. The address used is then
04656     // Ptr + (Offset & ~(RequiredAlign - 1)).
04657     if (Offset < 0)
04658       return SDValue();
04659     if ((Offset % RequiredAlign) & 3)
04660       return SDValue();
04661     int64_t StartOffset = Offset & ~(RequiredAlign-1);
04662     if (StartOffset)
04663       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
04664                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
04665 
04666     int EltNo = (Offset - StartOffset) >> 2;
04667     unsigned NumElems = VT.getVectorNumElements();
04668 
04669     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
04670     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
04671                              LD->getPointerInfo().getWithOffset(StartOffset),
04672                              false, false, false, 0);
04673 
04674     SmallVector<int, 8> Mask(NumElems, EltNo);
04675 
04676     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
04677   }
04678 
04679   return SDValue();
04680 }
04681 
04682 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
04683 /// elements can be replaced by a single large load which has the same value as
04684 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
04685 ///
04686 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
04687 ///
04688 /// FIXME: we'd also like to handle the case where the last elements are zero
04689 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
04690 /// There's even a handy isZeroNode for that purpose.
04691 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
04692                                         SDLoc &DL, SelectionDAG &DAG,
04693                                         bool isAfterLegalize) {
04694   unsigned NumElems = Elts.size();
04695 
04696   LoadSDNode *LDBase = nullptr;
04697   unsigned LastLoadedElt = -1U;
04698 
04699   // For each element in the initializer, see if we've found a load or an undef.
04700   // If we don't find an initial load element, or later load elements are
04701   // non-consecutive, bail out.
04702   for (unsigned i = 0; i < NumElems; ++i) {
04703     SDValue Elt = Elts[i];
04704     // Look through a bitcast.
04705     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
04706       Elt = Elt.getOperand(0);
04707     if (!Elt.getNode() ||
04708         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
04709       return SDValue();
04710     if (!LDBase) {
04711       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
04712         return SDValue();
04713       LDBase = cast<LoadSDNode>(Elt.getNode());
04714       LastLoadedElt = i;
04715       continue;
04716     }
04717     if (Elt.getOpcode() == ISD::UNDEF)
04718       continue;
04719 
04720     LoadSDNode *LD = cast<LoadSDNode>(Elt);
04721     EVT LdVT = Elt.getValueType();
04722     // Each loaded element must be the correct fractional portion of the
04723     // requested vector load.
04724     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
04725       return SDValue();
04726     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
04727       return SDValue();
04728     LastLoadedElt = i;
04729   }
04730 
04731   // If we have found an entire vector of loads and undefs, then return a large
04732   // load of the entire vector width starting at the base pointer.  If we found
04733   // consecutive loads for the low half, generate a vzext_load node.
04734   if (LastLoadedElt == NumElems - 1) {
04735     assert(LDBase && "Did not find base load for merging consecutive loads");
04736     EVT EltVT = LDBase->getValueType(0);
04737     // Ensure that the input vector size for the merged loads matches the
04738     // cumulative size of the input elements.
04739     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
04740       return SDValue();
04741 
04742     if (isAfterLegalize &&
04743         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
04744       return SDValue();
04745 
04746     SDValue NewLd = SDValue();
04747 
04748     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
04749                         LDBase->getPointerInfo(), LDBase->isVolatile(),
04750                         LDBase->isNonTemporal(), LDBase->isInvariant(),
04751                         LDBase->getAlignment());
04752 
04753     if (LDBase->hasAnyUseOfValue(1)) {
04754       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
04755                                      SDValue(LDBase, 1),
04756                                      SDValue(NewLd.getNode(), 1));
04757       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
04758       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
04759                              SDValue(NewLd.getNode(), 1));
04760     }
04761 
04762     return NewLd;
04763   }
04764 
04765   // TODO: The code below fires only for loading the low v2i32 / v2f32
04766   // of a v4i32 / v4f32. It's probably worth generalizing.
04767   EVT EltVT = VT.getVectorElementType();
04768   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
04769       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
04770     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
04771     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
04772     SDValue ResNode =
04773         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
04774                                 LDBase->getPointerInfo(),
04775                                 LDBase->getAlignment(),
04776                                 false/*isVolatile*/, true/*ReadMem*/,
04777                                 false/*WriteMem*/);
04778 
04779     // Make sure the newly-created LOAD is in the same position as LDBase in
04780     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
04781     // update uses of LDBase's output chain to use the TokenFactor.
04782     if (LDBase->hasAnyUseOfValue(1)) {
04783       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
04784                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
04785       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
04786       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
04787                              SDValue(ResNode.getNode(), 1));
04788     }
04789 
04790     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
04791   }
04792   return SDValue();
04793 }
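
// As a rough sketch of what the routine above matches (addresses and types
// chosen purely for illustration): a v4f32 build_vector of four consecutive
// loads
//   (f32 load [P+0]), (f32 load [P+4]), (f32 load [P+8]), (f32 load [P+12])
// is folded into the single wide load (v4f32 load [P+0]), while a
// build_vector whose two low elements are consecutive loads and whose high
// elements are undef becomes (bitcast (X86ISD::VZEXT_LOAD [P+0])), the
// v2i32 / v2f32 case handled at the end of the function.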
04794 
04795 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
04796 /// to generate a splat value for the following cases:
04797 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
04798 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
04799 /// a scalar load, or a constant.
04800 /// The VBROADCAST node is returned when a pattern is found,
04801 /// or SDValue() otherwise.
04802 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
04803                                     SelectionDAG &DAG) {
04804   // VBROADCAST requires AVX.
04805   // TODO: Splats could be generated for non-AVX CPUs using SSE
04806   // instructions, but there's less potential gain for only 128-bit vectors.
04807   if (!Subtarget->hasAVX())
04808     return SDValue();
04809 
04810   MVT VT = Op.getSimpleValueType();
04811   SDLoc dl(Op);
04812 
04813   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
04814          "Unsupported vector type for broadcast.");
04815 
04816   SDValue Ld;
04817   bool ConstSplatVal;
04818 
04819   switch (Op.getOpcode()) {
04820     default:
04821       // Unknown pattern found.
04822       return SDValue();
04823 
04824     case ISD::BUILD_VECTOR: {
04825       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
04826       BitVector UndefElements;
04827       SDValue Splat = BVOp->getSplatValue(&UndefElements);
04828 
04829       // We need a splat of a single value to use broadcast, and it doesn't
04830       // make any sense if the value is only in one element of the vector.
04831       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
04832         return SDValue();
04833 
04834       Ld = Splat;
04835       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
04836                        Ld.getOpcode() == ISD::ConstantFP);
04837 
04838       // Make sure that all of the users of a non-constant load are from the
04839       // BUILD_VECTOR node.
04840       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
04841         return SDValue();
04842       break;
04843     }
04844 
04845     case ISD::VECTOR_SHUFFLE: {
04846       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
04847 
04848       // Shuffles must have a splat mask where the first element is
04849       // broadcasted.
04850       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
04851         return SDValue();
04852 
04853       SDValue Sc = Op.getOperand(0);
04854       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
04855           Sc.getOpcode() != ISD::BUILD_VECTOR) {
04856 
04857         if (!Subtarget->hasInt256())
04858           return SDValue();
04859 
04860         // Use the register form of the broadcast instruction available on AVX2.
04861         if (VT.getSizeInBits() >= 256)
04862           Sc = Extract128BitVector(Sc, 0, DAG, dl);
04863         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
04864       }
04865 
04866       Ld = Sc.getOperand(0);
04867       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
04868                        Ld.getOpcode() == ISD::ConstantFP);
04869 
04870       // The scalar_to_vector node and the suspected
04871       // load node must have exactly one user.
04872       // Constants may have multiple users.
04873 
04874       // AVX-512 has a register version of the broadcast.
04875       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
04876         Ld.getValueType().getSizeInBits() >= 32;
04877       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
04878           !hasRegVer))
04879         return SDValue();
04880       break;
04881     }
04882   }
04883 
04884   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
04885   bool IsGE256 = (VT.getSizeInBits() >= 256);
04886 
04887   // When optimizing for size, generate up to 5 extra bytes for a broadcast
04888   // instruction to save 8 or more bytes of constant pool data.
04889   // TODO: If multiple splats are generated to load the same constant,
04890   // it may be detrimental to overall size. There needs to be a way to detect
04891   // that condition to know if this is truly a size win.
04892   const Function *F = DAG.getMachineFunction().getFunction();
04893   bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
04894 
04895   // Handle broadcasting a single constant scalar from the constant pool
04896   // into a vector.
04897   // On Sandybridge (no AVX2), it is still better to load a constant vector
04898   // from the constant pool and not to broadcast it from a scalar.
04899   // But override that restriction when optimizing for size.
04900   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
04901   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
04902     EVT CVT = Ld.getValueType();
04903     assert(!CVT.isVector() && "Must not broadcast a vector type");
04904 
04905     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
04906     // For size optimization, also splat v2f64 and v2i64, and for size opt
04907     // with AVX2, also splat i8 and i16.
04908     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
04909     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
04910         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
04911       const Constant *C = nullptr;
04912       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
04913         C = CI->getConstantIntValue();
04914       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
04915         C = CF->getConstantFPValue();
04916 
04917       assert(C && "Invalid constant type");
04918 
04919       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
04920       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
04921       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
04922       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
04923                        MachinePointerInfo::getConstantPool(),
04924                        false, false, false, Alignment);
04925 
04926       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04927     }
04928   }
04929 
04930   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
04931 
04932   // Handle AVX2 in-register broadcasts.
04933   if (!IsLoad && Subtarget->hasInt256() &&
04934       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
04935     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04936 
04937   // The scalar source must be a normal load.
04938   if (!IsLoad)
04939     return SDValue();
04940 
04941   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
04942       (Subtarget->hasVLX() && ScalarSize == 64))
04943     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04944 
04945   // The integer check is needed for broadcasting 64-bit into 128-bit, so that
04946   // this doesn't match double: there is no vbroadcastsd xmm.
04947   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
04948     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
04949       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04950   }
04951 
04952   // Unsupported broadcast.
04953   return SDValue();
04954 }
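
// A minimal sketch of the broadcast patterns recognized above (types and
// operands chosen for illustration): with AVX2, a splat such as
//   (v8f32 build_vector (f32 load [P]), (f32 load [P]), ..., (f32 load [P]))
// or
//   (vector_shuffle<0,0,...> (scalar_to_vector (f32 load [P])), undef)
// becomes (X86ISD::VBROADCAST (f32 load [P])). A constant splat is instead
// turned into a broadcast from a scalar constant-pool load, trading a few
// extra instruction bytes for 8 or more bytes of constant-pool data when
// optimizing for size.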
04955 
04956 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
04957 /// underlying vector and index.
04958 ///
04959 /// Modifies \p ExtractedFromVec to the real vector and returns the real
04960 /// index.
04961 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
04962                                          SDValue ExtIdx) {
04963   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
04964   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
04965     return Idx;
04966 
04967   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
04968   // lowered this:
04969   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
04970   // to:
04971   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
04972   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
04973   //                           undef)
04974   //                       Constant<0>)
04975   // In this case the vector is the extract_subvector expression and the index
04976   // is 2, as specified by the shuffle.
04977   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
04978   SDValue ShuffleVec = SVOp->getOperand(0);
04979   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
04980   assert(ShuffleVecVT.getVectorElementType() ==
04981          ExtractedFromVec.getSimpleValueType().getVectorElementType());
04982 
04983   int ShuffleIdx = SVOp->getMaskElt(Idx);
04984   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
04985     ExtractedFromVec = ShuffleVec;
04986     return ShuffleIdx;
04987   }
04988   return Idx;
04989 }
04990 
04991 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
04992   MVT VT = Op.getSimpleValueType();
04993 
04994   // Skip if insert_vec_elt is not supported.
04995   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
04996   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
04997     return SDValue();
04998 
04999   SDLoc DL(Op);
05000   unsigned NumElems = Op.getNumOperands();
05001 
05002   SDValue VecIn1;
05003   SDValue VecIn2;
05004   SmallVector<unsigned, 4> InsertIndices;
05005   SmallVector<int, 8> Mask(NumElems, -1);
05006 
05007   for (unsigned i = 0; i != NumElems; ++i) {
05008     unsigned Opc = Op.getOperand(i).getOpcode();
05009 
05010     if (Opc == ISD::UNDEF)
05011       continue;
05012 
05013     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
05014       // Quit if more than 1 element needs inserting.
05015       if (InsertIndices.size() > 1)
05016         return SDValue();
05017 
05018       InsertIndices.push_back(i);
05019       continue;
05020     }
05021 
05022     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
05023     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
05024     // Quit if non-constant index.
05025     if (!isa<ConstantSDNode>(ExtIdx))
05026       return SDValue();
05027     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
05028 
05029     // Quit if extracted from a vector of a different type.
05030     if (ExtractedFromVec.getValueType() != VT)
05031       return SDValue();
05032 
05033     if (!VecIn1.getNode())
05034       VecIn1 = ExtractedFromVec;
05035     else if (VecIn1 != ExtractedFromVec) {
05036       if (!VecIn2.getNode())
05037         VecIn2 = ExtractedFromVec;
05038       else if (VecIn2 != ExtractedFromVec)
05039         // Quit if there are more than 2 vectors to shuffle.
05040         return SDValue();
05041     }
05042 
05043     if (ExtractedFromVec == VecIn1)
05044       Mask[i] = Idx;
05045     else if (ExtractedFromVec == VecIn2)
05046       Mask[i] = Idx + NumElems;
05047   }
05048 
05049   if (!VecIn1.getNode())
05050     return SDValue();
05051 
05052   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
05053   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
05054   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
05055     unsigned Idx = InsertIndices[i];
05056     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
05057                      DAG.getIntPtrConstant(Idx));
05058   }
05059 
05060   return NV;
05061 }
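
// For illustration (operand shapes assumed): a v4i32 build_vector such as
//   (extract_elt A, 0), (extract_elt A, 2), X, (extract_elt B, 1)
// is rebuilt by the routine above as
//   insert_elt (vector_shuffle<0, 2, -1, 5> A, B), X, 2
// i.e. one shuffle of the two source vectors plus a single insert for the
// lone non-extract element X.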
05062 
05063 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
05064 SDValue
05065 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
05066 
05067   MVT VT = Op.getSimpleValueType();
05068   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
05069          "Unexpected type in LowerBUILD_VECTORvXi1!");
05070 
05071   SDLoc dl(Op);
05072   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05073     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
05074     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05075     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05076   }
05077 
05078   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
05079     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
05080     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05081     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05082   }
05083 
05084   bool AllConstants = true;
05085   uint64_t Immediate = 0;
05086   int NonConstIdx = -1;
05087   bool IsSplat = true;
05088   unsigned NumNonConsts = 0;
05089   unsigned NumConsts = 0;
05090   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
05091     SDValue In = Op.getOperand(idx);
05092     if (In.getOpcode() == ISD::UNDEF)
05093       continue;
05094     if (!isa<ConstantSDNode>(In)) {
05095       AllConstants = false;
05096       NonConstIdx = idx;
05097       NumNonConsts++;
05098     } else {
05099       NumConsts++;
05100       if (cast<ConstantSDNode>(In)->getZExtValue())
05101         Immediate |= (1ULL << idx);
05102     }
05103     if (In != Op.getOperand(0))
05104       IsSplat = false;
05105   }
05106 
05107   if (AllConstants) {
05108     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
05109       DAG.getConstant(Immediate, MVT::i16));
05110     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
05111                        DAG.getIntPtrConstant(0));
05112   }
05113 
05114   if (NumNonConsts == 1 && NonConstIdx != 0) {
05115     SDValue DstVec;
05116     if (NumConsts) {
05117       SDValue VecAsImm = DAG.getConstant(Immediate,
05118                                          MVT::getIntegerVT(VT.getSizeInBits()));
05119       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
05120     }
05121     else
05122       DstVec = DAG.getUNDEF(VT);
05123     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
05124                        Op.getOperand(NonConstIdx),
05125                        DAG.getIntPtrConstant(NonConstIdx));
05126   }
05127   if (!IsSplat && (NonConstIdx != 0))
05128     llvm_unreachable("Unsupported BUILD_VECTOR operation");
05129   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
05130   SDValue Select;
05131   if (IsSplat)
05132     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
05133                           DAG.getConstant(-1, SelectVT),
05134                           DAG.getConstant(0, SelectVT));
05135   else
05136     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
05137                          DAG.getConstant((Immediate | 1), SelectVT),
05138                          DAG.getConstant(Immediate, SelectVT));
05139   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
05140 }
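
// A worked example of the all-constant path above (values chosen for
// illustration): the v8i1 build_vector <1, 0, 1, 1, 0, 0, 0, 0> collects
// Immediate = 0b00001101, which is materialized as the scalar (i16 13),
// bitcast to v16i1, and then narrowed back to v8i1 with extract_subvector.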
05141 
05142 /// \brief Return true if \p N implements a horizontal binop and return the
05143 /// operands of the horizontal binop in V0 and V1.
05144 ///
05145 /// This is a helper function of PerformBUILD_VECTORCombine.
05146 /// This function checks whether the input build_vector \p N implements a
05147 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
05148 /// operation to match.
05149 /// For example, if \p Opcode is equal to ISD::ADD, then this function
05150 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
05151 /// is equal to ISD::SUB, then this function checks if this is a horizontal
05152 /// arithmetic sub.
05153 ///
05154 /// This function only analyzes elements of \p N whose indices are
05155 /// in range [BaseIdx, LastIdx).
05156 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
05157                               SelectionDAG &DAG,
05158                               unsigned BaseIdx, unsigned LastIdx,
05159                               SDValue &V0, SDValue &V1) {
05160   EVT VT = N->getValueType(0);
05161 
05162   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
05163   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
05164          "Invalid Vector in input!");
05165 
05166   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
05167   bool CanFold = true;
05168   unsigned ExpectedVExtractIdx = BaseIdx;
05169   unsigned NumElts = LastIdx - BaseIdx;
05170   V0 = DAG.getUNDEF(VT);
05171   V1 = DAG.getUNDEF(VT);
05172 
05173   // Check if N implements a horizontal binop.
05174   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
05175     SDValue Op = N->getOperand(i + BaseIdx);
05176 
05177     // Skip UNDEFs.
05178     if (Op->getOpcode() == ISD::UNDEF) {
05179       // Update the expected vector extract index.
05180       if (i * 2 == NumElts)
05181         ExpectedVExtractIdx = BaseIdx;
05182       ExpectedVExtractIdx += 2;
05183       continue;
05184     }
05185 
05186     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
05187 
05188     if (!CanFold)
05189       break;
05190 
05191     SDValue Op0 = Op.getOperand(0);
05192     SDValue Op1 = Op.getOperand(1);
05193 
05194     // Try to match the following pattern:
05195     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
05196     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
05197         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
05198         Op0.getOperand(0) == Op1.getOperand(0) &&
05199         isa<ConstantSDNode>(Op0.getOperand(1)) &&
05200         isa<ConstantSDNode>(Op1.getOperand(1)));
05201     if (!CanFold)
05202       break;
05203 
05204     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
05205     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
05206 
05207     if (i * 2 < NumElts) {
05208       if (V0.getOpcode() == ISD::UNDEF)
05209         V0 = Op0.getOperand(0);
05210     } else {
05211       if (V1.getOpcode() == ISD::UNDEF)
05212         V1 = Op0.getOperand(0);
05213       if (i * 2 == NumElts)
05214         ExpectedVExtractIdx = BaseIdx;
05215     }
05216 
05217     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
05218     if (I0 == ExpectedVExtractIdx)
05219       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
05220     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
05221       // Try to match the following dag sequence:
05222       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
05223       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
05224     } else
05225       CanFold = false;
05226 
05227     ExpectedVExtractIdx += 2;
05228   }
05229 
05230   return CanFold;
05231 }
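
// A small sketch of what this matcher accepts (vector names are for
// illustration only): the v4i32 build_vector
//   (add (extract A, 0), (extract A, 1)),
//   (add (extract A, 2), (extract A, 3)),
//   (add (extract B, 0), (extract B, 1)),
//   (add (extract B, 2), (extract B, 3))
// is reported as a horizontal add with V0 = A and V1 = B, which matches the
// semantics of the HADD/PHADD family.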
05232 
05233 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
05234 /// a concat_vector.
05235 ///
05236 /// This is a helper function of PerformBUILD_VECTORCombine.
05237 /// This function expects two 256-bit vectors called V0 and V1.
05238 /// At first, each vector is split into two separate 128-bit vectors.
05239 /// Then, the resulting 128-bit vectors are used to implement two
05240 /// horizontal binary operations.
05241 ///
05242 /// The kind of horizontal binary operation is defined by \p X86Opcode.
05243 ///
05244 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
05245 /// the two new horizontal binops.
05246 /// When Mode is set, the first horizontal binop dag node takes as input
05247 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
05248 /// horizontal binop dag node takes as input the lower 128-bit of V1
05249 /// and the upper 128-bit of V1.
05250 ///   Example:
05251 ///     HADD V0_LO, V0_HI
05252 ///     HADD V1_LO, V1_HI
05253 ///
05254 /// Otherwise, the first horizontal binop dag node takes as input the lower
05255 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
05256 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
05257 ///   Example:
05258 ///     HADD V0_LO, V1_LO
05259 ///     HADD V0_HI, V1_HI
05260 ///
05261 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
05262 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
05263 /// the upper 128-bits of the result.
05264 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
05265                                      SDLoc DL, SelectionDAG &DAG,
05266                                      unsigned X86Opcode, bool Mode,
05267                                      bool isUndefLO, bool isUndefHI) {
05268   EVT VT = V0.getValueType();
05269   assert(VT.is256BitVector() && VT == V1.getValueType() &&
05270          "Invalid nodes in input!");
05271 
05272   unsigned NumElts = VT.getVectorNumElements();
05273   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
05274   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
05275   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
05276   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
05277   EVT NewVT = V0_LO.getValueType();
05278 
05279   SDValue LO = DAG.getUNDEF(NewVT);
05280   SDValue HI = DAG.getUNDEF(NewVT);
05281 
05282   if (Mode) {
05283     // Don't emit a horizontal binop if the result is expected to be UNDEF.
05284     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
05285       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
05286     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
05287       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
05288   } else {
05289     // Don't emit a horizontal binop if the result is expected to be UNDEF.
05290     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
05291                        V1_LO->getOpcode() != ISD::UNDEF))
05292       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
05293 
05294     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
05295                        V1_HI->getOpcode() != ISD::UNDEF))
05296       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
05297   }
05298 
05299   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
05300 }
05301 
05302 /// \brief Try to fold a build_vector that performs an 'addsub' into the
05303 /// sequence of 'vadd + vsub + blendi'.
05304 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
05305                            const X86Subtarget *Subtarget) {
05306   SDLoc DL(BV);
05307   EVT VT = BV->getValueType(0);
05308   unsigned NumElts = VT.getVectorNumElements();
05309   SDValue InVec0 = DAG.getUNDEF(VT);
05310   SDValue InVec1 = DAG.getUNDEF(VT);
05311 
05312   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
05313           VT == MVT::v2f64) && "build_vector with an invalid type found!");
05314 
05315   // Odd-numbered elements in the input build vector are obtained from
05316   // adding two integer/float elements.
05317   // Even-numbered elements in the input build vector are obtained from
05318   // subtracting two integer/float elements.
05319   unsigned ExpectedOpcode = ISD::FSUB;
05320   unsigned NextExpectedOpcode = ISD::FADD;
05321   bool AddFound = false;
05322   bool SubFound = false;
05323 
05324   for (unsigned i = 0, e = NumElts; i != e; ++i) {
05325     SDValue Op = BV->getOperand(i);
05326 
05327     // Skip 'undef' values.
05328     unsigned Opcode = Op.getOpcode();
05329     if (Opcode == ISD::UNDEF) {
05330       std::swap(ExpectedOpcode, NextExpectedOpcode);
05331       continue;
05332     }
05333 
05334     // Early exit if we found an unexpected opcode.
05335     if (Opcode != ExpectedOpcode)
05336       return SDValue();
05337 
05338     SDValue Op0 = Op.getOperand(0);
05339     SDValue Op1 = Op.getOperand(1);
05340 
05341     // Try to match the following pattern:
05342     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
05343     // Early exit if we cannot match that sequence.
05344     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05345         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05346         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
05347         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
05348         Op0.getOperand(1) != Op1.getOperand(1))
05349       return SDValue();
05350 
05351     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
05352     if (I0 != i)
05353       return SDValue();
05354 
05355     // We found a valid add/sub node. Update the information accordingly.
05356     if (i & 1)
05357       AddFound = true;
05358     else
05359       SubFound = true;
05360 
05361     // Update InVec0 and InVec1.
05362     if (InVec0.getOpcode() == ISD::UNDEF)
05363       InVec0 = Op0.getOperand(0);
05364     if (InVec1.getOpcode() == ISD::UNDEF)
05365       InVec1 = Op1.getOperand(0);
05366 
05367     // Make sure that the operands of each add/sub node always
05368     // come from the same pair of vectors.
05369     if (InVec0 != Op0.getOperand(0)) {
05370       if (ExpectedOpcode == ISD::FSUB)
05371         return SDValue();
05372 
05373       // FADD is commutable. Try to commute the operands
05374       // and then test again.
05375       std::swap(Op0, Op1);
05376       if (InVec0 != Op0.getOperand(0))
05377         return SDValue();
05378     }
05379 
05380     if (InVec1 != Op1.getOperand(0))
05381       return SDValue();
05382 
05383     // Update the pair of expected opcodes.
05384     std::swap(ExpectedOpcode, NextExpectedOpcode);
05385   }
05386 
05387   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
05388   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
05389       InVec1.getOpcode() != ISD::UNDEF)
05390     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
05391 
05392   return SDValue();
05393 }
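
// For illustration (vector names assumed): the v4f32 build_vector
//   (fsub (extract A, 0), (extract B, 0)),
//   (fadd (extract A, 1), (extract B, 1)),
//   (fsub (extract A, 2), (extract B, 2)),
//   (fadd (extract A, 3), (extract B, 3))
// is matched above and folded into (X86ISD::ADDSUB A, B), i.e. what
// addsubps computes: subtract in the even lanes, add in the odd lanes.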
05394 
05395 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
05396                                           const X86Subtarget *Subtarget) {
05397   SDLoc DL(N);
05398   EVT VT = N->getValueType(0);
05399   unsigned NumElts = VT.getVectorNumElements();
05400   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
05401   SDValue InVec0, InVec1;
05402 
05403   // Try to match an ADDSUB.
05404   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
05405       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
05406     SDValue Value = matchAddSub(BV, DAG, Subtarget);
05407     if (Value.getNode())
05408       return Value;
05409   }
05410 
05411   // Try to match horizontal ADD/SUB.
05412   unsigned NumUndefsLO = 0;
05413   unsigned NumUndefsHI = 0;
05414   unsigned Half = NumElts/2;
05415 
05416   // Count the number of UNDEF operands in the input build_vector.
05417   for (unsigned i = 0, e = Half; i != e; ++i)
05418     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
05419       NumUndefsLO++;
05420 
05421   for (unsigned i = Half, e = NumElts; i != e; ++i)
05422     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
05423       NumUndefsHI++;
05424 
05425   // Early exit if this is a build_vector of all UNDEFs, or if all the
05426   // operands but one are UNDEF.
05427   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
05428     return SDValue();
05429 
05430   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
05431     // Try to match an SSE3 float HADD/HSUB.
05432     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
05433       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
05434 
05435     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
05436       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
05437   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
05438     // Try to match an SSSE3 integer HADD/HSUB.
05439     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
05440       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
05441 
05442     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
05443       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
05444   }
05445 
05446   if (!Subtarget->hasAVX())
05447     return SDValue();
05448 
05449   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
05450     // Try to match an AVX horizontal add/sub of packed single/double
05451     // precision floating point values from 256-bit vectors.
05452     SDValue InVec2, InVec3;
05453     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
05454         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
05455         ((InVec0.getOpcode() == ISD::UNDEF ||
05456           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05457         ((InVec1.getOpcode() == ISD::UNDEF ||
05458           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05459       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
05460 
05461     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
05462         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
05463         ((InVec0.getOpcode() == ISD::UNDEF ||
05464           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05465         ((InVec1.getOpcode() == ISD::UNDEF ||
05466           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05467       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
05468   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
05469     // Try to match an AVX2 horizontal add/sub of signed integers.
05470     SDValue InVec2, InVec3;
05471     unsigned X86Opcode;
05472     bool CanFold = true;
05473 
05474     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
05475         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
05476         ((InVec0.getOpcode() == ISD::UNDEF ||
05477           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05478         ((InVec1.getOpcode() == ISD::UNDEF ||
05479           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05480       X86Opcode = X86ISD::HADD;
05481     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
05482         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
05483         ((InVec0.getOpcode() == ISD::UNDEF ||
05484           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05485         ((InVec1.getOpcode() == ISD::UNDEF ||
05486           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05487       X86Opcode = X86ISD::HSUB;
05488     else
05489       CanFold = false;
05490 
05491     if (CanFold) {
05492       // Fold this build_vector into a single horizontal add/sub.
05493       // Do this only if the target has AVX2.
05494       if (Subtarget->hasAVX2())
05495         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
05496 
05497       // Do not try to expand this build_vector into a pair of horizontal
05498       // add/sub if we can emit a pair of scalar add/sub.
05499       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
05500         return SDValue();
05501 
05502       // Convert this build_vector into a pair of horizontal binops followed by
05503       // a concat vector.
05504       bool isUndefLO = NumUndefsLO == Half;
05505       bool isUndefHI = NumUndefsHI == Half;
05506       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
05507                                    isUndefLO, isUndefHI);
05508     }
05509   }
05510 
05511   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
05512        VT == MVT::v16i16) && Subtarget->hasAVX()) {
05513     unsigned X86Opcode;
05514     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
05515       X86Opcode = X86ISD::HADD;
05516     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
05517       X86Opcode = X86ISD::HSUB;
05518     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
05519       X86Opcode = X86ISD::FHADD;
05520     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
05521       X86Opcode = X86ISD::FHSUB;
05522     else
05523       return SDValue();
05524 
05525     // Don't try to expand this build_vector into a pair of horizontal add/sub
05526     // if we can simply emit a pair of scalar add/sub.
05527     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
05528       return SDValue();
05529 
05530     // Convert this build_vector into two horizontal add/sub followed by
05531     // a concat vector.
05532     bool isUndefLO = NumUndefsLO == Half;
05533     bool isUndefHI = NumUndefsHI == Half;
05534     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
05535                                  isUndefLO, isUndefHI);
05536   }
05537 
05538   return SDValue();
05539 }
05540 
05541 SDValue
05542 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
05543   SDLoc dl(Op);
05544 
05545   MVT VT = Op.getSimpleValueType();
05546   MVT ExtVT = VT.getVectorElementType();
05547   unsigned NumElems = Op.getNumOperands();
05548 
05549   // Generate vectors for predicate vectors.
05550   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
05551     return LowerBUILD_VECTORvXi1(Op, DAG);
05552 
05553   // Vectors containing all zeros can be matched by pxor and xorps later
05554   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05555     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
05556     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
05557     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
05558       return Op;
05559 
05560     return getZeroVector(VT, Subtarget, DAG, dl);
05561   }
05562 
05563   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
05564   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
05565   // vpcmpeqd on 256-bit vectors.
05566   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
05567     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
05568       return Op;
05569 
05570     if (!VT.is512BitVector())
05571       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
05572   }
05573 
05574   if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
05575     return Broadcast;
05576 
05577   unsigned EVTBits = ExtVT.getSizeInBits();
05578 
05579   unsigned NumZero  = 0;
05580   unsigned NumNonZero = 0;
05581   unsigned NonZeros = 0;
05582   bool IsAllConstants = true;
05583   SmallSet<SDValue, 8> Values;
05584   for (unsigned i = 0; i < NumElems; ++i) {
05585     SDValue Elt = Op.getOperand(i);
05586     if (Elt.getOpcode() == ISD::UNDEF)
05587       continue;
05588     Values.insert(Elt);
05589     if (Elt.getOpcode() != ISD::Constant &&
05590         Elt.getOpcode() != ISD::ConstantFP)
05591       IsAllConstants = false;
05592     if (X86::isZeroNode(Elt))
05593       NumZero++;
05594     else {
05595       NonZeros |= (1 << i);
05596       NumNonZero++;
05597     }
05598   }
05599 
05600   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
05601   if (NumNonZero == 0)
05602     return DAG.getUNDEF(VT);
05603 
05604   // Special case for single non-zero, non-undef, element.
05605   if (NumNonZero == 1) {
05606     unsigned Idx = countTrailingZeros(NonZeros);
05607     SDValue Item = Op.getOperand(Idx);
05608 
05609     // If this is an insertion of an i64 value on x86-32, and if the top bits of
05610     // the value are obviously zero, truncate the value to i32 and do the
05611     // insertion that way.  Only do this if the value is non-constant or if the
05612     // value is a constant being inserted into element 0.  It is cheaper to do
05613     // a constant pool load than it is to do a movd + shuffle.
05614     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
05615         (!IsAllConstants || Idx == 0)) {
05616       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
05617         // Handle SSE only.
05618         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
05619         EVT VecVT = MVT::v4i32;
05620 
05621         // Truncate the value (which may itself be a constant) to i32, and
05622         // convert it to a vector with movd (S2V+shuffle to zero extend).
05623         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
05624         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
05625         return DAG.getNode(
05626             ISD::BITCAST, dl, VT,
05627             getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
05628       }
05629     }
05630 
05631     // If we have a constant or non-constant insertion into the low element of
05632     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
05633     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
05634     // depending on what the source datatype is.
05635     if (Idx == 0) {
05636       if (NumZero == 0)
05637         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05638 
05639       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
05640           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
05641         if (VT.is512BitVector()) {
05642           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
05643           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
05644                              Item, DAG.getIntPtrConstant(0));
05645         }
05646         assert((VT.is128BitVector() || VT.is256BitVector()) &&
05647                "Expected an SSE value type!");
05648         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05649         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
05650         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05651       }
05652 
05653       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
05654         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
05655         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
05656         if (VT.is256BitVector()) {
05657           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
05658           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
05659         } else {
05660           assert(VT.is128BitVector() && "Expected an SSE value type!");
05661           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05662         }
05663         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
05664       }
05665     }
05666 
05667     // Is it a vector logical left shift?
05668     if (NumElems == 2 && Idx == 1 &&
05669         X86::isZeroNode(Op.getOperand(0)) &&
05670         !X86::isZeroNode(Op.getOperand(1))) {
05671       unsigned NumBits = VT.getSizeInBits();
05672       return getVShift(true, VT,
05673                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
05674                                    VT, Op.getOperand(1)),
05675                        NumBits/2, DAG, *this, dl);
05676     }
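
    // One concrete instance of the special case above (values chosen for
    // illustration): the v2i64 build_vector <0, X> with a non-constant X is
    // lowered as a vector shift of (scalar_to_vector X) left by half the
    // vector width (64 bits), conceptually a MOVQ followed by PSLLDQ $8,
    // leaving zero in element 0 and X in element 1.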
05677 
05678     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
05679       return SDValue();
05680 
05681     // Otherwise, if this is a vector with i32 or f32 elements, and the element
05682     // is a non-constant being inserted into an element other than the low one,
05683     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
05684     // movd/movss) to move this into the low element, then shuffle it into
05685     // place.
05686     if (EVTBits == 32) {
05687       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05688       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
05689     }
05690   }
05691 
05692   // Splat is obviously ok. Let legalizer expand it to a shuffle.
05693   if (Values.size() == 1) {
05694     if (EVTBits == 32) {
05695       // Instead of a shuffle like this:
05696       //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
05697       // check if it's possible to issue this instead:
05698       //   shuffle (vload ptr), undef, <1, 1, 1, 1>
05699       unsigned Idx = countTrailingZeros(NonZeros);
05700       SDValue Item = Op.getOperand(Idx);
05701       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
05702         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
05703     }
05704     return SDValue();
05705   }
05706 
05707   // A vector full of immediates; various special cases are already
05708   // handled, so this is best done with a single constant-pool load.
05709   if (IsAllConstants)
05710     return SDValue();
05711 
05712   // For AVX-length vectors, see if we can use a vector load to get all of the
05713   // elements, otherwise build the individual 128-bit pieces and use
05714   // shuffles to put them in place.
05715   if (VT.is256BitVector() || VT.is512BitVector()) {
05716     SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
05717 
05718     // Check for a build vector of consecutive loads.
05719     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
05720       return LD;
05721 
05722     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
05723 
05724     // Build both the lower and upper subvector.
05725     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
05726                                 makeArrayRef(&V[0], NumElems/2));
05727     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
05728                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
05729 
05730     // Recreate the wider vector with the lower and upper part.
05731     if (VT.is256BitVector())
05732       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
05733     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
05734   }
05735 
05736   // Let legalizer expand 2-wide build_vectors.
05737   if (EVTBits == 64) {
05738     if (NumNonZero == 1) {
05739       // One half is zero or undef.
05740       unsigned Idx = countTrailingZeros(NonZeros);
05741       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
05742                                  Op.getOperand(Idx));
05743       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
05744     }
05745     return SDValue();
05746   }
05747 
05748   // If element VT is < 32 bits, convert it to inserts into a zero vector.
05749   if (EVTBits == 8 && NumElems == 16)
05750     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
05751                                         Subtarget, *this))
05752       return V;
05753 
05754   if (EVTBits == 16 && NumElems == 8)
05755     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
05756                                       Subtarget, *this))
05757       return V;
05758 
05759   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
05760   if (EVTBits == 32 && NumElems == 4)
05761     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
05762       return V;
05763 
05764   // If element VT is == 32 bits, turn it into a number of shuffles.
05765   SmallVector<SDValue, 8> V(NumElems);
05766   if (NumElems == 4 && NumZero > 0) {
05767     for (unsigned i = 0; i < 4; ++i) {
05768       bool isZero = !(NonZeros & (1 << i));
05769       if (isZero)
05770         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
05771       else
05772         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
05773     }
05774 
05775     for (unsigned i = 0; i < 2; ++i) {
05776       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
05777         default: break;
05778         case 0:
05779           V[i] = V[i*2];  // Must be a zero vector.
05780           break;
05781         case 1:
05782           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
05783           break;
05784         case 2:
05785           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
05786           break;
05787         case 3:
05788           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
05789           break;
05790       }
05791     }
05792 
05793     bool Reverse1 = (NonZeros & 0x3) == 2;
05794     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
05795     int MaskVec[] = {
05796       Reverse1 ? 1 : 0,
05797       Reverse1 ? 0 : 1,
05798       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
05799       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
05800     };
05801     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
05802   }
05803 
05804   if (Values.size() > 1 && VT.is128BitVector()) {
05805     // Check for a build vector of consecutive loads.
05806     for (unsigned i = 0; i < NumElems; ++i)
05807       V[i] = Op.getOperand(i);
05808 
05809     // Check for elements which are consecutive loads.
05810     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
05811       return LD;
05812 
05813     // Check for a build vector built mostly from a shuffle plus a few inserts.
05814     if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
05815       return Sh;
05816 
05817     // For SSE 4.1, use insertps to insert each of the upper elements into place.
05818     if (Subtarget->hasSSE41()) {
05819       SDValue Result;
05820       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
05821         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
05822       else
05823         Result = DAG.getUNDEF(VT);
05824 
05825       for (unsigned i = 1; i < NumElems; ++i) {
05826         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
05827         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
05828                              Op.getOperand(i), DAG.getIntPtrConstant(i));
05829       }
05830       return Result;
05831     }
05832 
05833     // Otherwise, expand into a number of unpckl*, start by extending each of
05834     // our (non-undef) elements to the full vector width with the element in the
05835     // bottom slot of the vector (which generates no code for SSE).
05836     for (unsigned i = 0; i < NumElems; ++i) {
05837       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
05838         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
05839       else
05840         V[i] = DAG.getUNDEF(VT);
05841     }
05842 
05843     // Next, we iteratively mix elements, e.g. for v4f32:
05844     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
05845     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
05846     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
05847     unsigned EltStride = NumElems >> 1;
05848     while (EltStride != 0) {
05849       for (unsigned i = 0; i < EltStride; ++i) {
05850         // If V[i+EltStride] is undef and this is the first round of mixing,
05851         // then it is safe to just drop this shuffle: V[i] is already in the
05852         // right place, the one element (since it's the first round) being
05853         // inserted as undef can be dropped.  This isn't safe for successive
05854         // rounds because they will permute elements within both vectors.
05855         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
05856             EltStride == NumElems/2)
05857           continue;
05858 
05859         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
05860       }
05861       EltStride >>= 1;
05862     }
05863     return V[0];
05864   }
05865   return SDValue();
05866 }
05867 
05868 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
05869 // to create 256-bit vectors from two other 128-bit ones.
05870 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
05871   SDLoc dl(Op);
05872   MVT ResVT = Op.getSimpleValueType();
05873 
05874   assert((ResVT.is256BitVector() ||
05875           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
05876 
05877   SDValue V1 = Op.getOperand(0);
05878   SDValue V2 = Op.getOperand(1);
05879   unsigned NumElems = ResVT.getVectorNumElements();
05880   if (ResVT.is256BitVector())
05881     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
05882 
05883   if (Op.getNumOperands() == 4) {
05884     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
05885                                 ResVT.getVectorNumElements()/2);
05886     SDValue V3 = Op.getOperand(2);
05887     SDValue V4 = Op.getOperand(3);
05888     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
05889       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
05890   }
05891   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
05892 }
05893 
05894 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
05895                                        const X86Subtarget *Subtarget,
05896                                        SelectionDAG &DAG) {
05897   SDLoc dl(Op);
05898   MVT ResVT = Op.getSimpleValueType();
05899   unsigned NumOfOperands = Op.getNumOperands();
05900 
05901   assert(isPowerOf2_32(NumOfOperands) &&
05902          "Unexpected number of operands in CONCAT_VECTORS");
05903 
05904   if (NumOfOperands > 2) {
05905     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
05906                                   ResVT.getVectorNumElements()/2);
05907     SmallVector<SDValue, 2> Ops;
05908     for (unsigned i = 0; i < NumOfOperands/2; i++)
05909       Ops.push_back(Op.getOperand(i));
05910     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
05911     Ops.clear();
05912     for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
05913       Ops.push_back(Op.getOperand(i));
05914     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
05915     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
05916   }
05917 
05918   SDValue V1 = Op.getOperand(0);
05919   SDValue V2 = Op.getOperand(1);
05920   bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
05921   bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
05922 
05923   if (IsZeroV1 && IsZeroV2)
05924     return getZeroVector(ResVT, Subtarget, DAG, dl);
05925 
05926   SDValue ZeroIdx = DAG.getIntPtrConstant(0);
05927   SDValue Undef = DAG.getUNDEF(ResVT);
05928   unsigned NumElems = ResVT.getVectorNumElements();
05929   SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8);
05930 
05931   V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx);
05932   V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits);
05933   if (IsZeroV1)
05934     return V2;
05935 
05936   V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
05937   // Zero the upper bits of V1
05938   V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits);
05939   V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits);
05940   if (IsZeroV2)
05941     return V1;
05942   return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
05943 }
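
// Sketch of the mask-concatenation trick above (widths chosen for
// illustration): to concatenate two v8i1 values V1 and V2 into a v16i1,
// both are widened into v16i1 mask registers, V2 is shifted left by 8 bit
// positions, the upper 8 bits of V1 are cleared with a shift-left /
// shift-right pair, and the two halves are OR'ed together, roughly
//   result = (V2 << 8) | (V1 & 0xFF)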
05944 
05945 static SDValue LowerCONCAT_VECTORS(SDValue Op,
05946                                    const X86Subtarget *Subtarget,
05947                                    SelectionDAG &DAG) {
05948   MVT VT = Op.getSimpleValueType();
05949   if (VT.getVectorElementType() == MVT::i1)
05950     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
05951 
05952   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
05953          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
05954           Op.getNumOperands() == 4)));
05955 
05956   // AVX can use the vinsertf128 instruction to create 256-bit vectors
05957   // from two other 128-bit ones.
05958 
05959   // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
05960   return LowerAVXCONCAT_VECTORS(Op, DAG);
05961 }
05962 
05963 
05964 //===----------------------------------------------------------------------===//
05965 // Vector shuffle lowering
05966 //
05967 // This is an experimental code path for lowering vector shuffles on x86. It is
05968 // designed to handle arbitrary vector shuffles and blends, gracefully
05969 // degrading performance as necessary. It works hard to recognize idiomatic
05970 // shuffles and lower them to optimal instruction patterns while staying within
05971 // a framework that allows reasonably efficient handling of all vector shuffle
05972 // patterns.
05973 //===----------------------------------------------------------------------===//
05974 
05975 /// \brief Tiny helper function to identify a no-op mask.
05976 ///
05977 /// This is a somewhat boring predicate function. It checks whether the mask
05978 /// array input, which is assumed to be a single-input shuffle mask of the kind
05979 /// used by the X86 shuffle instructions (not a fully general
05980 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
05981 /// in-place shuffle are no-ops.
05982 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
05983   for (int i = 0, Size = Mask.size(); i < Size; ++i)
05984     if (Mask[i] != -1 && Mask[i] != i)
05985       return false;
05986   return true;
05987 }
05988 
05989 /// \brief Helper function to classify a mask as a single-input mask.
05990 ///
05991 /// This isn't a generic single-input test because in the vector shuffle
05992 /// lowering we canonicalize single inputs to be the first input operand. This
05993 /// means we can more quickly test for a single input by only checking whether
05994 /// an input from the second operand exists. We also assume that the size of the
05995 /// mask corresponds to the size of the input vectors, which isn't true in the
05996 /// fully general case.
05997 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
05998   for (int M : Mask)
05999     if (M >= (int)Mask.size())
06000       return false;
06001   return true;
06002 }
06003 
06004 /// \brief Test whether there are elements crossing 128-bit lanes in this
06005 /// shuffle mask.
06006 ///
06007 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
06008 /// and we routinely test for these.
06009 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
06010   int LaneSize = 128 / VT.getScalarSizeInBits();
06011   int Size = Mask.size();
06012   for (int i = 0; i < Size; ++i)
06013     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
06014       return true;
06015   return false;
06016 }
06017 
06018 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
06019 ///
06020 /// This checks a shuffle mask to see if it is performing the same
06021 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
06022 /// that it is also not lane-crossing. It may however involve a blend from the
06023 /// same lane of a second vector.
06024 ///
06025 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
06026 /// non-trivial to compute in the face of undef lanes. The representation is
06027 /// *not* suitable for use with existing 128-bit shuffles as it will contain
06028 /// entries from both V1 and V2 inputs to the wider mask.
06029 static bool
06030 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
06031                                 SmallVectorImpl<int> &RepeatedMask) {
06032   int LaneSize = 128 / VT.getScalarSizeInBits();
06033   RepeatedMask.resize(LaneSize, -1);
06034   int Size = Mask.size();
06035   for (int i = 0; i < Size; ++i) {
06036     if (Mask[i] < 0)
06037       continue;
06038     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
06039       // This entry crosses lanes, so there is no way to model this shuffle.
06040       return false;
06041 
06042     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
06043     if (RepeatedMask[i % LaneSize] == -1)
06044       // This is the first non-undef entry in this slot of a 128-bit lane.
06045       RepeatedMask[i % LaneSize] =
06046           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
06047     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
06048       // Found a mismatch with the repeated mask.
06049       return false;
06050   }
06051   return true;
06052 }
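
// For example, the v8i32 mask {0, 1, 8, 9, 4, 5, 12, 13} performs the same
// blend of V1 and V2 in both 128-bit lanes, so the routine above returns true
// with a repeated mask of {0, 1, 8, 9} (the last two entries referring into
// V2), whereas {0, 1, 8, 9, 4, 5, 12, 15} fails the repeat check on its final
// element.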
06053 
06054 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
06055 /// arguments.
06056 ///
06057 /// This is a fast way to test a shuffle mask against a fixed pattern:
06058 ///
06059 ///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
06060 ///
06061 /// It returns true if the mask is exactly as wide as the argument list, and
06062 /// each element of the mask is either -1 (signifying undef) or the value given
06063 /// in the argument.
06064 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
06065                                 ArrayRef<int> ExpectedMask) {
06066   if (Mask.size() != ExpectedMask.size())
06067     return false;
06068 
06069   int Size = Mask.size();
06070 
06071   // If the values are build vectors, we can look through them to find
06072   // equivalent inputs that make the shuffles equivalent.
06073   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
06074   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
06075 
06076   for (int i = 0; i < Size; ++i)
06077     if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
06078       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
06079       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
06080       if (!MaskBV || !ExpectedBV ||
06081           MaskBV->getOperand(Mask[i] % Size) !=
06082               ExpectedBV->getOperand(ExpectedMask[i] % Size))
06083         return false;
06084     }
06085 
06086   return true;
06087 }
06088 
06089 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
06090 ///
06091 /// This helper function produces an 8-bit shuffle immediate corresponding to
06092 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
06093 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
06094 /// example.
06095 ///
06096 /// NB: We rely heavily on "undef" masks preserving the input lane.
06097 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
06098                                           SelectionDAG &DAG) {
06099   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
06100   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
06101   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
06102   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
06103   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
06104 
06105   unsigned Imm = 0;
06106   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
06107   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
06108   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
06109   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
06110   return DAG.getConstant(Imm, MVT::i8);
06111 }
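
// For example, the mask {2, 3, 0, 1} packs as
//   Imm = 2 | (3 << 2) | (0 << 4) | (1 << 6) = 0x4E,
// the familiar PSHUFD immediate for swapping the two 64-bit halves of a
// v4i32, while a fully undef mask {-1, -1, -1, -1} produces the identity
// encoding 0xE4 because each undef slot keeps its own lane index.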
06112 
06113 /// \brief Try to emit a blend instruction for a shuffle using bit math.
06114 ///
06115 /// This is used as a fallback approach when first class blend instructions are
06116 /// unavailable. Currently it is only suitable for integer vectors, but could
06117 /// be generalized for floating point vectors if desirable.
06118 static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
06119                                             SDValue V2, ArrayRef<int> Mask,
06120                                             SelectionDAG &DAG) {
06121   assert(VT.isInteger() && "Only supports integer vector types!");
06122   MVT EltVT = VT.getScalarType();
06123   int NumEltBits = EltVT.getSizeInBits();
06124   SDValue Zero = DAG.getConstant(0, EltVT);
06125   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), EltVT);
06126   SmallVector<SDValue, 16> MaskOps;
06127   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06128     if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
06129       return SDValue(); // Shuffled input!
06130     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
06131   }
06132 
06133   SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
06134   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
06135   // We have to cast V2 around.
06136   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
06137   V2 = DAG.getNode(ISD::BITCAST, DL, VT,
06138                    DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
06139                                DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask),
06140                                DAG.getNode(ISD::BITCAST, DL, MaskVT, V2)));
06141   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
06142 }
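
// For example, the v4i32 mask {0, 5, 2, 7} keeps elements 0 and 2 of V1 and
// elements 1 and 3 of V2 in place, so V1Mask becomes {-1, 0, -1, 0} and the
// result is (V1 & V1Mask) | (V2 & ~V1Mask), with the ANDNP done on v2i64
// after bitcasting. A mask such as {1, 5, 2, 7} is rejected because element 0
// is moved rather than kept in place.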
06143 
06144 /// \brief Try to emit a blend instruction for a shuffle.
06145 ///
06146 /// This doesn't do any checks for the availability of instructions for blending
06147 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
06148 /// be matched in the backend with the type given. What it does check for is
06149 /// that the shuffle mask is in fact a blend.
06150 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
06151                                          SDValue V2, ArrayRef<int> Mask,
06152                                          const X86Subtarget *Subtarget,
06153                                          SelectionDAG &DAG) {
06154   unsigned BlendMask = 0;
06155   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06156     if (Mask[i] >= Size) {
06157       if (Mask[i] != i + Size)
06158         return SDValue(); // Shuffled V2 input!
06159       BlendMask |= 1u << i;
06160       continue;
06161     }
06162     if (Mask[i] >= 0 && Mask[i] != i)
06163       return SDValue(); // Shuffled V1 input!
06164   }
06165   switch (VT.SimpleTy) {
06166   case MVT::v2f64:
06167   case MVT::v4f32:
06168   case MVT::v4f64:
06169   case MVT::v8f32:
06170     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
06171                        DAG.getConstant(BlendMask, MVT::i8));
06172 
06173   case MVT::v4i64:
06174   case MVT::v8i32:
06175     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
06176     // FALLTHROUGH
06177   case MVT::v2i64:
06178   case MVT::v4i32:
06179     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
06180     // that instruction.
06181     if (Subtarget->hasAVX2()) {
06182       // Scale the blend by the number of 32-bit dwords per element.
06183       int Scale = VT.getScalarSizeInBits() / 32;
06184       BlendMask = 0;
06185       for (int i = 0, Size = Mask.size(); i < Size; ++i)
06186         if (Mask[i] >= Size)
06187           for (int j = 0; j < Scale; ++j)
06188             BlendMask |= 1u << (i * Scale + j);
06189 
06190       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
06191       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
06192       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
06193       return DAG.getNode(ISD::BITCAST, DL, VT,
06194                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
06195                                      DAG.getConstant(BlendMask, MVT::i8)));
06196     }
06197     // FALLTHROUGH
06198   case MVT::v8i16: {
06199     // For integer shuffles we need to expand the mask and cast the inputs to
06200     // v8i16s prior to blending.
06201     int Scale = 8 / VT.getVectorNumElements();
06202     BlendMask = 0;
06203     for (int i = 0, Size = Mask.size(); i < Size; ++i)
06204       if (Mask[i] >= Size)
06205         for (int j = 0; j < Scale; ++j)
06206           BlendMask |= 1u << (i * Scale + j);
06207 
06208     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
06209     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
06210     return DAG.getNode(ISD::BITCAST, DL, VT,
06211                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
06212                                    DAG.getConstant(BlendMask, MVT::i8)));
06213   }
06214 
06215   case MVT::v16i16: {
06216     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
06217     SmallVector<int, 8> RepeatedMask;
06218     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
06219       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
06220       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
06221       BlendMask = 0;
06222       for (int i = 0; i < 8; ++i)
06223         if (RepeatedMask[i] >= 16)
06224           BlendMask |= 1u << i;
06225       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
06226                          DAG.getConstant(BlendMask, MVT::i8));
06227     }
06228   }
06229     // FALLTHROUGH
06230   case MVT::v16i8:
06231   case MVT::v32i8: {
06232     assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
06233            "256-bit byte-blends require AVX2 support!");
06234 
06235     // Scale the blend by the number of bytes per element.
06236     int Scale = VT.getScalarSizeInBits() / 8;
06237 
06238     // This form of blend is always done on bytes. Compute the byte vector
06239     // type.
06240     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
06241 
06242     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
06243     // mix of LLVM's code generator and the x86 backend. We tell the code
06244     // generator that boolean values in the elements of an x86 vector register
06245     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
06246     // mapping a select to operand #1, and 'false' mapping to operand #2. The
06247     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
06248     // of the element (the remaining are ignored) and 0 in that high bit would
06249     // mean operand #1 while 1 in the high bit would mean operand #2. So while
06250     // the LLVM model for boolean values in vector elements gets the relevant
06251     // bit set, it is set backwards and over constrained relative to x86's
06252     // actual model.
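    // For example, a byte taken from V1 (Mask[i] < Size) gets an all-ones
    // mask byte, selecting VSELECT operand #1 (V1), while a byte taken from
    // V2 gets a zero mask byte, selecting operand #2 (V2).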
06253     SmallVector<SDValue, 32> VSELECTMask;
06254     for (int i = 0, Size = Mask.size(); i < Size; ++i)
06255       for (int j = 0; j < Scale; ++j)
06256         VSELECTMask.push_back(
06257             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
06258                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
06259 
06260     V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
06261     V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
06262     return DAG.getNode(
06263         ISD::BITCAST, DL, VT,
06264         DAG.getNode(ISD::VSELECT, DL, BlendVT,
06265                     DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
06266                     V1, V2));
06267   }
06268 
06269   default:
06270     llvm_unreachable("Not a supported integer vector type!");
06271   }
06272 }
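
// For example, the v8i16 mask {0, 9, 2, 11, 4, 13, 6, 15} keeps the even
// elements of V1 and the odd elements of V2 in place, giving a PBLENDW
// immediate of 0xAA. Any element that is neither undef, i, nor i + Size
// causes an early bail-out above, because that would need a shuffle rather
// than a pure blend.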
06273 
06274 /// \brief Try to lower as a blend of elements from two inputs followed by
06275 /// a single-input permutation.
06276 ///
06277 /// This matches the pattern where we can blend elements from two inputs and
06278 /// then reduce the shuffle to a single-input permutation.
06279 static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
06280                                                    SDValue V2,
06281                                                    ArrayRef<int> Mask,
06282                                                    SelectionDAG &DAG) {
06283   // We build up the blend mask while checking whether a blend is a viable way
06284   // to reduce the shuffle.
06285   SmallVector<int, 32> BlendMask(Mask.size(), -1);
06286   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
06287 
06288   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06289     if (Mask[i] < 0)
06290       continue;
06291 
06292     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
06293 
06294     if (BlendMask[Mask[i] % Size] == -1)
06295       BlendMask[Mask[i] % Size] = Mask[i];
06296     else if (BlendMask[Mask[i] % Size] != Mask[i])
06297       return SDValue(); // Can't blend in the needed input!
06298 
06299     PermuteMask[i] = Mask[i] % Size;
06300   }
06301 
06302   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
06303   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
06304 }
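
// For example, the v4i32 mask {1, 4, 3, 6} becomes the blend {4, 1, 6, 3}
// followed by the single-input permute {1, 0, 3, 2}. A mask such as
// {1, 5, 3, 6} is rejected because elements 1 and 5 would both need slot 1 of
// the blended vector, so no per-element blend can feed the permute.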
06305 
06306 /// \brief Generic routine to decompose a shuffle and blend into independent
06307 /// blends and permutes.
06308 ///
06309 /// This matches the extremely common pattern for handling combined
06310 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
06311 /// operations. It will try to pick the best arrangement of shuffles and
06312 /// blends.
06313 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
06314                                                           SDValue V1,
06315                                                           SDValue V2,
06316                                                           ArrayRef<int> Mask,
06317                                                           SelectionDAG &DAG) {
06318   // Shuffle the input elements into the desired positions in V1 and V2 and
06319   // blend them together.
06320   SmallVector<int, 32> V1Mask(Mask.size(), -1);
06321   SmallVector<int, 32> V2Mask(Mask.size(), -1);
06322   SmallVector<int, 32> BlendMask(Mask.size(), -1);
06323   for (int i = 0, Size = Mask.size(); i < Size; ++i)
06324     if (Mask[i] >= 0 && Mask[i] < Size) {
06325       V1Mask[i] = Mask[i];
06326       BlendMask[i] = i;
06327     } else if (Mask[i] >= Size) {
06328       V2Mask[i] = Mask[i] - Size;
06329       BlendMask[i] = i + Size;
06330     }
06331 
06332   // Try to lower with the simpler initial blend strategy unless one of the
06333   // input shuffles would be a no-op. We prefer to shuffle inputs as the
06334   // shuffle may be able to fold with a load or other benefit. However, when
06335   // we'll have to do 2x as many shuffles in order to achieve this, blending
06336   // first is a better strategy.
06337   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
06338     if (SDValue BlendPerm =
06339             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
06340       return BlendPerm;
06341 
06342   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
06343   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
06344   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
06345 }
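
// For example, the v4i32 mask {2, 7, 1, 4} splits into V1Mask {2, -1, 1, -1},
// V2Mask {-1, 3, -1, 0} and BlendMask {0, 5, 2, 7}. Neither per-input shuffle
// is a no-op there, so the blend-then-permute strategy is attempted first;
// only if that fails are the two inputs shuffled separately and then blended.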
06346 
06347 /// \brief Try to lower a vector shuffle as a byte rotation.
06348 ///
06349 /// SSSE3 provides a generic PALIGNR instruction that will do an arbitrary
06350 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
06351 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
06352 /// try to generically lower a vector shuffle through such a pattern. It
06353 /// does not check for the profitability of lowering either as PALIGNR or
06354 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
06355 /// This matches shuffle vectors that look like:
06356 ///
06357 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
06358 ///
06359 /// Essentially it concatenates V1 and V2, shifts right by some number of
06360 /// elements, and takes the low elements as the result. Note that while this is
06361 /// specified as a *right shift* because x86 is little-endian, it is a *left
06362 /// rotate* of the vector lanes.
06363 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
06364                                               SDValue V2,
06365                                               ArrayRef<int> Mask,
06366                                               const X86Subtarget *Subtarget,
06367                                               SelectionDAG &DAG) {
06368   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
06369 
06370   int NumElts = Mask.size();
06371   int NumLanes = VT.getSizeInBits() / 128;
06372   int NumLaneElts = NumElts / NumLanes;
06373 
06374   // We need to detect various ways of spelling a rotation:
06375   //   [11, 12, 13, 14, 15,  0,  1,  2]
06376   //   [-1, 12, 13, 14, -1, -1,  1, -1]
06377   //   [-1, -1, -1, -1, -1, -1,  1,  2]
06378   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
06379   //   [-1,  4,  5,  6, -1, -1,  9, -1]
06380   //   [-1,  4,  5,  6, -1, -1, -1, -1]
06381   int Rotation = 0;
06382   SDValue Lo, Hi;
06383   for (int l = 0; l < NumElts; l += NumLaneElts) {
06384     for (int i = 0; i < NumLaneElts; ++i) {
06385       if (Mask[l + i] == -1)
06386         continue;
06387       assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
06388 
06389       // Get the mod-Size index and lane correct it.
06390       int LaneIdx = (Mask[l + i] % NumElts) - l;
06391       // Make sure it was in this lane.
06392       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
06393         return SDValue();
06394 
06395       // Determine where a rotated vector would have started.
06396       int StartIdx = i - LaneIdx;
06397       if (StartIdx == 0)
06398         // The identity rotation isn't interesting, stop.
06399         return SDValue();
06400 
06401       // If we found the tail of a vector the rotation must be the missing
06402       // front. If we found the head of a vector, it must be how much of the
06403       // head.
06404       int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
06405 
06406       if (Rotation == 0)
06407         Rotation = CandidateRotation;
06408       else if (Rotation != CandidateRotation)
06409         // The rotations don't match, so we can't match this mask.
06410         return SDValue();
06411 
06412       // Compute which value this mask is pointing at.
06413       SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
06414 
06415       // Compute which of the two target values this index should be assigned
06416       // to. This reflects whether the high elements are remaining or the low
06417       // elements are remaining.
06418       SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
06419 
06420       // Either set up this value if we've not encountered it before, or check
06421       // that it remains consistent.
06422       if (!TargetV)
06423         TargetV = MaskV;
06424       else if (TargetV != MaskV)
06425         // This may be a rotation, but it pulls from the inputs in some
06426         // unsupported interleaving.
06427         return SDValue();
06428     }
06429   }
06430 
06431   // Check that we successfully analyzed the mask, and normalize the results.
06432   assert(Rotation != 0 && "Failed to locate a viable rotation!");
06433   assert((Lo || Hi) && "Failed to find a rotated input vector!");
06434   if (!Lo)
06435     Lo = Hi;
06436   else if (!Hi)
06437     Hi = Lo;
06438 
06439   // The actual rotate instruction rotates bytes, so we need to scale the
06440   // rotation based on how many bytes are in the vector lane.
06441   int Scale = 16 / NumLaneElts;
06442 
06443   // SSSE3 targets can use the palignr instruction.
06444   if (Subtarget->hasSSSE3()) {
06445     // Cast the inputs to i8 vector of correct length to match PALIGNR.
06446     MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
06447     Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
06448     Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
06449 
06450     return DAG.getNode(ISD::BITCAST, DL, VT,
06451                        DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
06452                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
06453   }
06454 
06455   assert(VT.getSizeInBits() == 128 &&
06456          "Rotate-based lowering only supports 128-bit lowering!");
06457   assert(Mask.size() <= 16 &&
06458          "Can shuffle at most 16 bytes in a 128-bit vector!");
06459 
06460   // Default SSE2 implementation
06461   int LoByteShift = 16 - Rotation * Scale;
06462   int HiByteShift = Rotation * Scale;
06463 
06464   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
06465   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
06466   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
06467 
06468   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
06469                                 DAG.getConstant(LoByteShift, MVT::i8));
06470   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
06471                                 DAG.getConstant(HiByteShift, MVT::i8));
06472   return DAG.getNode(ISD::BITCAST, DL, VT,
06473                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
06474 }
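
// For example, for the v8i16 mask [11, 12, 13, 14, 15, 0, 1, 2] every element
// taken from V2 yields StartIdx == -3 and every element taken from V1 yields
// StartIdx == 5, both of which agree on a rotation of 3 elements; with 16-bit
// elements that is scaled by 2 to a 6-byte rotation amount.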
06475 
06476 /// \brief Compute whether each element of a shuffle is zeroable.
06477 ///
06478 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
06479 /// Either it is an undef element in the shuffle mask, the element of the input
06480 /// referenced is undef, or the element of the input referenced is known to be
06481 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
06482 /// as many lanes with this technique as possible to simplify the remaining
06483 /// shuffle.
06484 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
06485                                                      SDValue V1, SDValue V2) {
06486   SmallBitVector Zeroable(Mask.size(), false);
06487 
06488   while (V1.getOpcode() == ISD::BITCAST)
06489     V1 = V1->getOperand(0);
06490   while (V2.getOpcode() == ISD::BITCAST)
06491     V2 = V2->getOperand(0);
06492 
06493   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
06494   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
06495 
06496   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06497     int M = Mask[i];
06498     // Handle the easy cases.
06499     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
06500       Zeroable[i] = true;
06501       continue;
06502     }
06503 
06504     // If this is an index into a build_vector node (which has the same number
06505     // of elements), dig out the input value and use it.
06506     SDValue V = M < Size ? V1 : V2;
06507     if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
06508       continue;
06509 
06510     SDValue Input = V.getOperand(M % Size);
06511     // The UNDEF opcode check really should be dead code here, but not quite
06512     // worth asserting on (it isn't invalid, just unexpected).
06513     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
06514       Zeroable[i] = true;
06515   }
06516 
06517   return Zeroable;
06518 }
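
// For example, if V2 is an all-zeros build_vector, every mask index referring
// into V2 is zeroable (as is every undef index); and if V1 is a build_vector
// whose third operand is a zero constant or undef, mask index 2 is zeroable
// even though V1 as a whole is not known to be zero.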
06519 
06520 /// \brief Try to emit a bitmask instruction for a shuffle.
06521 ///
06522 /// This handles cases where we can model a blend exactly as a bitmask due to
06523 /// one of the inputs being zeroable.
06524 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
06525                                            SDValue V2, ArrayRef<int> Mask,
06526                                            SelectionDAG &DAG) {
06527   MVT EltVT = VT.getScalarType();
06528   int NumEltBits = EltVT.getSizeInBits();
06529   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
06530   SDValue Zero = DAG.getConstant(0, IntEltVT);
06531   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
06532   if (EltVT.isFloatingPoint()) {
06533     Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
06534     AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
06535   }
06536   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
06537   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
06538   SDValue V;
06539   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06540     if (Zeroable[i])
06541       continue;
06542     if (Mask[i] % Size != i)
06543       return SDValue(); // Not a blend.
06544     if (!V)
06545       V = Mask[i] < Size ? V1 : V2;
06546     else if (V != (Mask[i] < Size ? V1 : V2))
06547       return SDValue(); // Can only let one input through the mask.
06548 
06549     VMaskOps[i] = AllOnes;
06550   }
06551   if (!V)
06552     return SDValue(); // No non-zeroable elements!
06553 
06554   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
06555   V = DAG.getNode(VT.isFloatingPoint()
06556                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
06557                   DL, VT, V, VMask);
06558   return V;
06559 }
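
// For example, with V2 zero and the v4i32 mask {0, 4, 2, 4}, elements 1 and 3
// are zeroable, so the shuffle becomes V1 & {-1, 0, -1, 0}. The lowering gives
// up if any non-zeroable element is out of its original position or if the
// surviving elements come from both inputs.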
06560 
06561 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
06562 ///
06563 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
06564 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
06565 /// matches elements from one of the input vectors shuffled to the left or
06566 /// right with zeroable elements 'shifted in'. It handles both the strictly
06567 /// bit-wise element shifts and the byte shift across an entire 128-bit double
06568 /// quad word lane.
06569 ///
06570 /// PSLL : (little-endian) left bit shift.
06571 /// [ zz, 0, zz,  2 ]
06572 /// [ -1, 4, zz, -1 ]
06573 /// PSRL : (little-endian) right bit shift.
06574 /// [  1, zz,  3, zz]
06575 /// [ -1, -1,  7, zz]
06576 /// PSLLDQ : (little-endian) left byte shift
06577 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
06578 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
06579 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
06580 /// PSRLDQ : (little-endian) right byte shift
06581 /// [  5, 6,  7, zz, zz, zz, zz, zz]
06582 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
06583 /// [  1, 2, -1, -1, -1, -1, zz, zz]
06584 static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
06585                                          SDValue V2, ArrayRef<int> Mask,
06586                                          SelectionDAG &DAG) {
06587   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
06588 
06589   int Size = Mask.size();
06590   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
06591 
06592   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
06593     for (int i = 0; i < Size; i += Scale)
06594       for (int j = 0; j < Shift; ++j)
06595         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
06596           return false;
06597 
06598     return true;
06599   };
06600 
06601   auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
06602     for (int i = 0; i != Size; i += Scale) {
06603       unsigned Pos = Left ? i + Shift : i;
06604       unsigned Low = Left ? i : i + Shift;
06605       unsigned Len = Scale - Shift;
06606       if (!isSequentialOrUndefInRange(Mask, Pos, Len,
06607                                       Low + (V == V1 ? 0 : Size)))
06608         return SDValue();
06609     }
06610 
06611     int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
06612     bool ByteShift = ShiftEltBits > 64;
06613     unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
06614                            : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
06615     int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
06616 
06617     // Normalize the scale for byte shifts to still produce an i64 element
06618     // type.
06619     Scale = ByteShift ? Scale / 2 : Scale;
06620 
06621     // We need to round trip through the appropriate type for the shift.
06622     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
06623     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
06624     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
06625            "Illegal integer vector type");
06626     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
06627 
06628     V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
06629     return DAG.getNode(ISD::BITCAST, DL, VT, V);
06630   };
06631 
06632   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
06633   // keep doubling the size of the integer elements up to that. We can
06634   // then shift the elements of the integer vector by whole multiples of
06635   // their width within the elements of the larger integer vector. Test each
06636   // multiple to see if we can find a match with the moved element indices
06637   // and that the shifted in elements are all zeroable.
06638   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
06639     for (int Shift = 1; Shift != Scale; ++Shift)
06640       for (bool Left : {true, false})
06641         if (CheckZeros(Shift, Scale, Left))
06642           for (SDValue V : {V1, V2})
06643             if (SDValue Match = MatchShift(Shift, Scale, Left, V))
06644               return Match;
06645 
06646   // no match
06647   return SDValue();
06648 }
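
// For example, with V2 zero and the v4i32 mask {4, 0, 4, 2}, elements 0 and 2
// are zeroable and the remaining elements each move up by one 32-bit slot, so
// this matches at Scale == 2, Shift == 1: the vector is bitcast to v2i64,
// shifted left by 32 bits (PSLLQ), and bitcast back.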
06649 
06650 /// \brief Lower a vector shuffle as a zero or any extension.
06651 ///
06652 /// Given a specific number of elements, element bit width, and extension
06653 /// stride, produce either a zero or any extension based on the available
06654 /// features of the subtarget.
06655 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
06656     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
06657     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
06658   assert(Scale > 1 && "Need a scale to extend.");
06659   int NumElements = VT.getVectorNumElements();
06660   int EltBits = VT.getScalarSizeInBits();
06661   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
06662          "Only 8, 16, and 32 bit elements can be extended.");
06663   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
06664 
06665   // Found a valid zext mask! Try various lowering strategies based on the
06666   // input type and available ISA extensions.
06667   if (Subtarget->hasSSE41()) {
06668     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
06669                                  NumElements / Scale);
06670     return DAG.getNode(ISD::BITCAST, DL, VT,
06671                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
06672   }
06673 
06674   // For any extends we can cheat for larger element sizes and use shuffle
06675   // instructions that can fold with a load and/or copy.
06676   if (AnyExt && EltBits == 32) {
06677     int PSHUFDMask[4] = {0, -1, 1, -1};
06678     return DAG.getNode(
06679         ISD::BITCAST, DL, VT,
06680         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
06681                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
06682                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
06683   }
06684   if (AnyExt && EltBits == 16 && Scale > 2) {
06685     int PSHUFDMask[4] = {0, -1, 0, -1};
06686     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
06687                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
06688                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
06689     int PSHUFHWMask[4] = {1, -1, -1, -1};
06690     return DA