doxygen/LowerMatrixIntrinsics_8cpp_source.html

//===- LowerMatrixIntrinsics.cpp -  Lower matrix intrinsics -----*- C++ -*-===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// Lower matrix intrinsics to vector operations.

//

// TODO:

//  * Improve fusion:

//   * Support more cases, e.g. multiply-add, multiply-sub, operands/results

//     transposed.

//   * Improve cost-modeling, e.g. choose different number of rows/columns

//     for tiles, consider cost of copies on alias.

//

//===----------------------------------------------------------------------===//


#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"

#include "llvm/ADT/PostOrderIterator.h"

#include "llvm/ADT/STLExtras.h"

#include "llvm/ADT/ScopeExit.h"

#include "llvm/ADT/SmallVector.h"

#include "llvm/ADT/Statistic.h"

#include "llvm/Analysis/AliasAnalysis.h"

#include "llvm/Analysis/DomTreeUpdater.h"

#include "llvm/Analysis/LoopInfo.h"

#include "llvm/Analysis/OptimizationRemarkEmitter.h"

#include "llvm/Analysis/TargetTransformInfo.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/Analysis/VectorUtils.h"

#include "llvm/IR/CFG.h"

#include "llvm/IR/DataLayout.h"

#include "llvm/IR/DebugInfoMetadata.h"

#include "llvm/IR/DerivedTypes.h"

#include "llvm/IR/Function.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/InstrTypes.h"

#include "llvm/IR/Instructions.h"

#include "llvm/IR/IntrinsicInst.h"

#include "llvm/IR/MatrixBuilder.h"

#include "llvm/IR/PatternMatch.h"

#include "llvm/Support/Alignment.h"

#include "llvm/Support/CommandLine.h"

#include "llvm/Support/Compiler.h"

#include "llvm/Support/Debug.h"

#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#include "llvm/Transforms/Utils/LoopUtils.h"

#include "llvm/Transforms/Utils/MatrixUtils.h"


#include <cmath>


using namespace llvm;

using namespace PatternMatch;


#define DEBUG_TYPE "lower-matrix-intrinsics"


STATISTIC(FlattenedMatrices, "Number of matrix flattenings");

STATISTIC(ReshapedMatrices, "Number of matrix reshapes");

STATISTIC(SplitMatrices, "Number of matrix splits");


// Command-line options controlling this pass. Each option below states its
// own purpose in its cl::desc string; the comments only summarize the option
// name and the default given by cl::init.

// "fuse-matrix": hidden boolean option, defaults to true.
static cl::opt<bool>

    FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,

               cl::desc("Enable/disable fusing matrix instructions."));

// TODO: Allow and use non-square tiles.

// "fuse-matrix-tile-size": hidden unsigned option, defaults to 4.
static cl::opt<unsigned> TileSize(

    "fuse-matrix-tile-size", cl::init(4), cl::Hidden,

    cl::desc(

        "Tile size for matrix instruction fusion using square-shaped tiles."));

// "fuse-matrix-use-loops": hidden boolean option, defaults to false.
static cl::opt<bool> TileUseLoops("fuse-matrix-use-loops", cl::init(false),

                                  cl::Hidden,

                                  cl::desc("Generate loop nest for tiling."));

// "force-fuse-matrix": hidden boolean option, defaults to false.
static cl::opt<bool> ForceFusion(

    "force-fuse-matrix", cl::init(false), cl::Hidden,

    cl::desc("Force matrix instruction fusion even if not profitable."));

// "matrix-allow-contract": hidden boolean option, defaults to false. It is
// OR-ed into the allow-contract fast-math flag by getFastMathFlags().
static cl::opt<bool> AllowContractEnabled(

    "matrix-allow-contract", cl::init(false), cl::Hidden,

    cl::desc("Allow the use of FMAs if available and profitable. This may "

             "result in different results, due to less rounding error."));


// "verify-matrix-shapes": hidden boolean option, defaults to false. When set,
// setShapeInfo() aborts compilation on conflicting shapes for the same value.
static cl::opt<bool>

    VerifyShapeInfo("verify-matrix-shapes", cl::Hidden,

                    cl::desc("Enable/disable matrix shape verification."),

                    cl::init(false));


// The two memory layouts a lowered matrix can use.
enum class MatrixLayoutTy { ColumnMajor, RowMajor };


// "matrix-default-layout": selects the layout ("column-major" or "row-major");
// defaults to column-major. ShapeInfo and MatrixTy read this option when they
// are constructed.
static cl::opt<MatrixLayoutTy> MatrixLayout(

    "matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor),

    cl::desc("Sets the default matrix layout"),

    cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major",

                          "Use column-major layout"),

               clEnumValN(MatrixLayoutTy::RowMajor, "row-major",

                          "Use row-major layout")));


// "matrix-print-after-transpose-opt": boolean option, defaults to false. No
// description string is attached and it is not marked hidden.
static cl::opt<bool> PrintAfterTransposeOpt("matrix-print-after-transpose-opt",

                                            cl::init(false));


/// Resolve the DISubprogram for \p Scope. If \p Scope is itself a
/// DISubprogram it is returned unchanged; otherwise \p Scope is cast to a
/// DILocalScope and the subprogram attached to that scope is returned.
static DISubprogram *getSubprogram(DIScope *Scope) {
  auto *SP = dyn_cast<DISubprogram>(Scope);
  if (!SP)
    SP = cast<DILocalScope>(Scope)->getSubprogram();
  return SP;
}


/// Return true if \p V is a shufflevector instruction that splats its zeroth
/// element (the form used when multiplying a matrix with a scalar). Any other
/// kind of value yields false.
static bool isSplat(Value *V) {
  const auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
  return Shuffle && Shuffle->isZeroEltSplat();
}


/// Build a matcher that accepts either an integer mul or a floating-point
/// fmul whose operands match \p L and \p R.
template <typename LTy, typename RTy>
auto m_AnyMul(const LTy &L, const RTy &R) {
  auto IntMul = m_Mul(L, R);
  auto FPMul = m_FMul(L, R);
  return m_CombineOr(IntMul, FPMul);
}


/// Build a matcher that accepts either an integer add or a floating-point
/// fadd whose operands match \p L and \p R.
template <typename LTy, typename RTy>
auto m_AnyAdd(const LTy &L, const RTy &R) {
  auto IntAdd = m_Add(L, R);
  auto FPAdd = m_FAdd(L, R);
  return m_CombineOr(IntAdd, FPAdd);
}


namespace {


// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute

// the start address of vector \p VecIdx with type (\p EltType x \p NumElements)

// assuming \p Stride elements between the starts of two consecutive vectors.

// \p Stride must be >= \p NumElements.

// For column-major matrixes, the function computes the address of a column

// vector and \p NumElements must be set to the number of elements in a column

// (= number of rows of the matrix). For row-major matrixes, the function

// computes the address of a row vector and \p NumElements must be set to the

// number of elements in a row (= number of columns of the matrix).

//

// Consider a 4x4 matrix in column-major layout like below

//

//      0       1      2      3

// 0   v_0_0  v_0_1  v_0_2  v_0_3

// 1   v_1_0  v_1_1  v_1_2  v_1_3

// 2   v_2_0  v_2_1  v_2_2  v_2_3

// 3   v_3_0  v_3_1  v_3_2  v_3_3


// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1,

// we need a pointer to the first element of the submatrix as base pointer.

// Then we can use computeVectorAddr to compute the addresses for the columns

// of the sub-matrix.

//

// Column 0: computeVectorAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)

//           -> just returns Base

// Column 1: computeVectorAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)

//           -> returns Base + (1 * 4)

// Column 2: computeVectorAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)

//           -> returns Base + (2 * 4)

//

// The graphic below illustrates the number of elements in a column (marked

// with |) and the number of skipped elements (marked with }).

//

//         v_0_0  v_0_1 {v_0_2 {v_0_3

//                Base   Col 1  Col 2

//                  |     |      |

//         v_1_0 |v_1_1 |v_1_2 |v_1_3

//         v_2_0 |v_2_1 |v_2_2 |v_2_3

//         v_3_0 {v_3_1 {v_3_2  v_3_3

//

Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
                         unsigned NumElements, Type *EltType,
                         IRBuilder<> &Builder) {
  // Only a constant stride can be validated here; it has to be large enough
  // to hold one complete result vector.
  assert((!isa<ConstantInt>(Stride) ||
          cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
         "Stride must be >= the number of elements in the result vector.");

  // Element offset of the requested vector from the base: VecIdx * Stride.
  Value *Offset = Builder.CreateMul(VecIdx, Stride, "vec.start");

  // An offset that is the constant zero selects vector 0, which starts at the
  // base pointer itself, so no GEP is emitted.
  if (auto *ConstOffset = dyn_cast<ConstantInt>(Offset);
      ConstOffset && ConstOffset->isZero())
    return BasePtr;

  return Builder.CreateGEP(EltType, BasePtr, Offset, "vec.gep");
}


namespace {

/// Describes the dimensions of a matrix together with the layout that was
/// selected (via the MatrixLayout option) when the object was constructed.
/// A default-constructed ShapeInfo has zero rows and columns and converts to
/// false, i.e. "no shape known".
struct ShapeInfo {
  unsigned NumRows;
  unsigned NumColumns;

  /// True if the MatrixLayout option selected column-major layout at
  /// construction time.
  bool IsColumnMajor;

  ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
      : NumRows(NumRows), NumColumns(NumColumns),
        IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}

  /// Build a shape from two values that must both be ConstantInts (enforced
  /// by cast<>).
  ShapeInfo(Value *NumRows, Value *NumColumns)
      : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(),
                  cast<ConstantInt>(NumColumns)->getZExtValue()) {}

  /// Two shapes are equal if their row and column counts agree; the layout
  /// flag is not part of the comparison. The comparison operators are
  /// const-qualified so they can be applied to const objects and do not become
  /// ambiguous with their reversed candidates under C++20 rules.
  bool operator==(const ShapeInfo &other) const {
    return NumRows == other.NumRows && NumColumns == other.NumColumns;
  }
  bool operator!=(const ShapeInfo &other) const { return !(*this == other); }

  /// Returns true if shape-information is defined, meaning both dimensions
  /// are != 0.
  operator bool() const {
    // A shape with rows but without columns is malformed.
    assert(NumRows == 0 || NumColumns != 0);
    return NumRows != 0;
  }

  /// Number of elements in one vector of the matrix: the number of rows for
  /// column-major layout, the number of columns otherwise.
  unsigned getStride() const {
    if (IsColumnMajor)
      return NumRows;
    return NumColumns;
  }

  /// Number of vectors the matrix consists of: the number of columns for
  /// column-major layout, the number of rows otherwise.
  unsigned getNumVectors() const {
    if (IsColumnMajor)
      return NumColumns;
    return NumRows;
  }

  /// Returns the transposed shape.
  ShapeInfo t() const { return ShapeInfo(NumColumns, NumRows); }

  friend raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI);

  LLVM_DUMP_METHOD void dump() const { dbgs() << *this << '\n'; }
};


/// Print \p SI to \p OS in the form "<rows>x<columns>".
raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI) {
  OS << SI.NumRows << 'x' << SI.NumColumns;
  return OS;
}


} // namespace


/// Classify \p V as "uniform shape" or not. The callers visible in this file
/// use a true result to mean that the operands and the result of \p V share a
/// single shape, so shape information may be copied between them.
///
/// Returns true for non-instructions, binary operators, PHIs, fneg, the
/// abs/fabs intrinsics, the integer/floating-point conversion casts, and
/// bitcasts between fixed vectors with the same number of elements. Returns
/// false for everything else.
static bool isUniformShape(Value *V) {
  auto *Inst = dyn_cast<Instruction>(V);
  if (!Inst || Inst->isBinaryOp())
    return true;

  if (auto *Cast = dyn_cast<CastInst>(Inst)) {
    // The switch deliberately has no default so that every cast opcode is
    // spelled out.
    switch (Cast->getOpcode()) {
    case Instruction::AddrSpaceCast:
    case Instruction::PtrToAddr:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
      return false;
    case Instruction::Trunc:
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::UIToFP:
    case Instruction::SIToFP:
    case Instruction::FPTrunc:
    case Instruction::FPExt:
      return true;
    case Instruction::BitCast: {
      // Only a bitcast that keeps the element count of a fixed vector
      // qualifies.
      auto *SrcVTy = dyn_cast<FixedVectorType>(Cast->getSrcTy());
      auto *DestVTy = dyn_cast<FixedVectorType>(Cast->getDestTy());
      return SrcVTy && DestVTy &&
             SrcVTy->getNumElements() == DestVTy->getNumElements();
    }
    case Instruction::CastOpsEnd:
      llvm_unreachable("not an actual cast op");
    }
    llvm_unreachable("unhandled cast opcode");
  }

  if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
    const Intrinsic::ID IID = II->getIntrinsicID();
    return IID == Intrinsic::abs || IID == Intrinsic::fabs;
  }

  const unsigned Opcode = Inst->getOpcode();
  return Opcode == Instruction::PHI || Opcode == Instruction::FNeg;
}


/// Return the ShapeInfo for the result of \p I, if it can be determined.
///
/// Matrix intrinsics take their shape from their constant dimension
/// arguments. A plain store takes the shape recorded in \p ShapeMap for the
/// stored value. Uniform-shape instructions and selects take the shape of the
/// first operand (for selects: skipping the condition) that has an entry in
/// \p ShapeMap. Returns std::nullopt if none of these apply.
static std::optional<ShapeInfo>
computeShapeInfoForInst(Instruction *I,
                        const DenseMap<Value *, ShapeInfo> &ShapeMap) {
  // Shape recorded for V in ShapeMap, or std::nullopt if there is none.
  auto LookupShape = [&ShapeMap](Value *V) -> std::optional<ShapeInfo> {
    auto It = ShapeMap.find(V);
    if (It == ShapeMap.end())
      return std::nullopt;
    return It->second;
  };

  Value *DimA = nullptr;
  Value *DimB = nullptr;
  Value *DimC = nullptr;

  // multiply(A, B, M, N, K): the result shape is M x K.
  if (match(I, m_Intrinsic<Intrinsic::matrix_multiply>(
                   m_Value(), m_Value(), m_Value(DimA), m_Value(DimB),
                   m_Value(DimC))))
    return ShapeInfo(DimA, DimC);

  // transpose(A, M, N): the result has the flipped shape N x M.
  if (match(I, m_Intrinsic<Intrinsic::matrix_transpose>(
                   m_Value(), m_Value(DimA), m_Value(DimB))))
    return ShapeInfo(DimB, DimA);

  // NOTE(review): the two trailing dimension arguments are recorded in
  // swapped order here, whereas propagateShapeBackward assigns the stored
  // operand the unswapped order - confirm this is intentional.
  if (match(I, m_Intrinsic<Intrinsic::matrix_column_major_store>(
                   m_Value(), m_Value(), m_Value(), m_Value(), m_Value(DimA),
                   m_Value(DimB))))
    return ShapeInfo(DimB, DimA);

  // column_major_load(..., M, N): the result shape is M x N.
  if (match(I, m_Intrinsic<Intrinsic::matrix_column_major_load>(
                   m_Value(), m_Value(), m_Value(), m_Value(DimA),
                   m_Value(DimB))))
    return ShapeInfo(DimA, DimB);

  // A plain store inherits the shape of the value being stored, if known.
  Value *Stored = nullptr;
  if (match(I, m_Store(m_Value(Stored), m_Value())))
    if (std::optional<ShapeInfo> Shape = LookupShape(Stored))
      return Shape;

  if (!isUniformShape(I) && !isa<SelectInst>(I))
    return std::nullopt;

  // The first operand of a select is its condition, which is skipped.
  auto Ops = I->operands();
  auto ShapedOps = isa<SelectInst>(I) ? drop_begin(Ops) : Ops;
  // Use the first operand with a known shape.
  for (auto &Op : ShapedOps)
    if (std::optional<ShapeInfo> Shape = LookupShape(Op.get()))
      return Shape;

  return std::nullopt;
}


/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.

///

/// Currently, the lowering for each matrix intrinsic is done as follows:

/// 1. Propagate the shape information from intrinsics to connected

/// instructions.

/// 2. Lower instructions with shape information (assuming column-major layout).

///  The lowering works similarly using row-major layout.

///  2.1. Get column vectors for each argument. If we already lowered the

///       definition of an argument, use the produced column vectors directly.

///       If not, split the operand vector containing an embedded matrix into

///       a set of column vectors.

///  2.2. Lower the instruction in terms of column major operations, which

///       yields a set of column vectors containing result matrix. Note that we

///       lower all instructions that have shape information. Besides the

///       intrinsics, this includes stores for example.

///  2.3. Update uses of the lowered instruction. If we have shape information

///       for a user, there is nothing to do, as we will look up the result

///       column matrix when lowering the user. For other uses, we embed the

///       result matrix in a flat vector and update the use.

///  2.4. Cache the result column matrix for the instruction we lowered

/// 3. After we lowered all instructions in a function, remove the now

///    obsolete instructions.

///

class LowerMatrixIntrinsics {

  Function &Func;

  const DataLayout &DL;

  const TargetTransformInfo &TTI;

  FunctionAnalysisManager *AM;

  AliasAnalysis *AA = nullptr;

  DominatorTree *DT = nullptr;

  LoopInfo *LI = nullptr;

  OptimizationRemarkEmitter *ORE = nullptr;


  /// Counters estimating how many operations (loads, stores, compute) are
  /// required to lower a matrix operation. All counters start at zero.
  struct OpInfoTy {
    /// Stores emitted to generate this matrix.
    unsigned NumStores = 0;
    /// Loads emitted to generate this matrix.
    unsigned NumLoads = 0;
    /// Compute operations emitted to generate this matrix.
    unsigned NumComputeOps = 0;
    /// Transposes that could not be made "free". Most of the time transposes
    /// can be fused with matrix multiplies or folded away via algebraic
    /// simplifications; this counts the ones for which that failed.
    unsigned NumExposedTransposes = 0;

    /// Add every counter of \p Other onto the matching counter of this object
    /// and return this object.
    OpInfoTy &operator+=(const OpInfoTy &Other) {
      NumExposedTransposes += Other.NumExposedTransposes;
      NumComputeOps += Other.NumComputeOps;
      NumLoads += Other.NumLoads;
      NumStores += Other.NumStores;
      return *this;
    }
  };


  /// Wrapper class representing a matrix as a set of vectors, either in row or
  /// column major layout. All vectors must have the same vector type.
  ///
  /// The layout is taken from the MatrixLayout option when the object is
  /// constructed. The wrapper also carries OpInfoTy counters that callers
  /// update through the add*/set* methods.
  class MatrixTy {
    /// The column vectors (column-major layout) or row vectors (row-major
    /// layout) making up the matrix.
    SmallVector<Value *, 16> Vectors;

    OpInfoTy OpInfo;

    bool IsColumnMajor = true;

  public:
    /// Create a matrix without any vectors.
    MatrixTy() : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}

    /// Create a matrix from the given \p Vectors.
    MatrixTy(ArrayRef<Value *> Vectors)
        : Vectors(Vectors),
          IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}

    /// Create a \p NumRows x \p NumColumns matrix whose vectors are all
    /// poison values with element type \p EltTy.
    MatrixTy(unsigned NumRows, unsigned NumColumns, Type *EltTy)
        : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {

      // D is the number of vectors; each vector holds the other dimension's
      // number of elements.
      unsigned D = isColumnMajor() ? NumColumns : NumRows;
      for (unsigned J = 0; J < D; ++J)
        addVector(PoisonValue::get(FixedVectorType::get(
            EltTy, isColumnMajor() ? NumRows : NumColumns)));
    }

    Value *getVector(unsigned i) const { return Vectors[i]; }
    Value *getColumn(unsigned i) const {
      assert(isColumnMajor() && "only supported for column-major matrixes");
      return Vectors[i];
    }
    Value *getRow(unsigned i) const {
      assert(!isColumnMajor() && "only supported for row-major matrixes");
      return Vectors[i];
    }

    void setVector(unsigned i, Value *V) { Vectors[i] = V; }

    /// Element type of the vectors; requires at least one vector.
    Type *getElementType() const { return getVectorTy()->getElementType(); }

    unsigned getNumVectors() const {
      if (isColumnMajor())
        return getNumColumns();
      return getNumRows();
    }

    unsigned getNumColumns() const {
      if (isColumnMajor())
        return Vectors.size();
      else {
        // In row-major layout the column count is the length of a row vector,
        // so at least one row must be present.
        assert(Vectors.size() > 0 && "Cannot call getNumColumns without rows");
        return getVectorTy()->getNumElements();
      }
    }
    unsigned getNumRows() const {
      if (isColumnMajor()) {
        // In column-major layout the row count is the length of a column
        // vector, so at least one column must be present.
        assert(Vectors.size() > 0 && "Cannot call getNumRows without columns");
        return getVectorTy()->getNumElements();
      } else
        return Vectors.size();
    }

    void addVector(Value *V) { Vectors.push_back(V); }
    FixedVectorType *getColumnTy() const {
      assert(isColumnMajor() && "only supported for column-major matrixes");
      return getVectorTy();
    }

    /// Type of the first vector; requires at least one vector.
    FixedVectorType *getVectorTy() const {
      return cast<FixedVectorType>(Vectors[0]->getType());
    }

    iterator_range<SmallVector<Value *, 16>::iterator> columns() {
      assert(isColumnMajor() &&
             "columns() only supported for column-major matrixes");
      return make_range(Vectors.begin(), Vectors.end());
    }

    iterator_range<SmallVector<Value *, 16>::iterator> vectors() {
      return make_range(Vectors.begin(), Vectors.end());
    }

    /// Embed the vectors of the matrix into a flat vector by concatenating
    /// them. A single vector is returned as is.
    Value *embedInVector(IRBuilder<> &Builder) const {
      return Vectors.size() == 1 ? Vectors[0]
                                 : concatenateVectors(Builder, Vectors);
    }

    MatrixTy &addNumLoads(unsigned N) {
      OpInfo.NumLoads += N;
      return *this;
    }

    void setNumLoads(unsigned N) { OpInfo.NumLoads = N; }

    MatrixTy &addNumStores(unsigned N) {
      OpInfo.NumStores += N;
      return *this;
    }

    MatrixTy &addNumExposedTransposes(unsigned N) {
      OpInfo.NumExposedTransposes += N;
      return *this;
    }

    MatrixTy &addNumComputeOps(unsigned N) {
      OpInfo.NumComputeOps += N;
      return *this;
    }

    unsigned getNumStores() const { return OpInfo.NumStores; }
    unsigned getNumLoads() const { return OpInfo.NumLoads; }
    unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; }

    const OpInfoTy &getOpInfo() const { return OpInfo; }

    bool isColumnMajor() const { return IsColumnMajor; }

    /// Number of elements per vector: rows for column-major layout, columns
    /// otherwise.
    unsigned getStride() const {
      if (isColumnMajor())
        return getNumRows();
      return getNumColumns();
    }

    ShapeInfo shape() const { return {getNumRows(), getNumColumns()}; }

    /// Extract a vector of \p NumElts starting at index (\p I, \p J). If the
    /// matrix is column-major, the result vector is extracted from a column
    /// vector, otherwise from a row vector.
    Value *extractVector(unsigned I, unsigned J, unsigned NumElts,
                         IRBuilder<> &Builder) const {
      Value *Vec = isColumnMajor() ? getColumn(J) : getRow(I);
      assert(cast<FixedVectorType>(Vec->getType())->getNumElements() >=
                 NumElts &&
             "Extracted vector will contain poison values");
      // The start position inside the selected vector is the row index for a
      // column vector and the column index for a row vector.
      return Builder.CreateShuffleVector(
          Vec, createSequentialMask(isColumnMajor() ? I : J, NumElts, 0),
          "block");
    }
  };


  /// Maps instructions to their shape information. The shape information

  /// describes the shape to be used while lowering. This matches the shape of

  /// the result value of the instruction, with the only exceptions being store

  /// instructions and the matrix_column_major_store intrinsics. For those, the

  /// shape information indicates that those instructions should be lowered

  /// using shape information as well. Note that extra care is needed when

  /// erasing or RAUW'ing a value that is present in ShapeMap. If the

  /// replacement is also a matrix operation, use

  /// updateShapeAndReplaceAllUsesWith to make sure the replacement is added to

  /// ShapeMap.  We don't use ValueMap, as there are also cases where we do not

  /// want to add shape information for a replacement instruction. When directly

  /// erasing a value with an entry in ShapeMap, use

  /// eraseFromParentAndRemoveFromShapeMap to make sure ShapeMap is also updated

  /// accordingly.

  DenseMap<Value *, ShapeInfo> ShapeMap;


  /// List of instructions to remove. While lowering, we are not replacing all

  /// users of a lowered instruction, if shape information is available and

  /// those need to be removed after we finished lowering.

  SmallVector<Instruction *, 16> ToRemove;


  /// Map from instructions to their produced column matrix.

  MapVector<Value *, MatrixTy> Inst2ColumnMatrix;


private:

  /// Return the fast-math flags to use when lowering \p Inst: the flags of
  /// \p Inst if it is a floating-point math operator (default flags
  /// otherwise), with allow-contract additionally set when the
  /// AllowContractEnabled option is on.
  static FastMathFlags getFastMathFlags(Instruction *Inst) {
    FastMathFlags Flags = isa<FPMathOperator>(*Inst)
                              ? Inst->getFastMathFlags()
                              : FastMathFlags();
    if (AllowContractEnabled)
      Flags.setAllowContract(true);
    return Flags;
  }


public:

  /// Set up the lowering state for \p F. The data layout is obtained from
  /// \p F; \p TTI and \p AM are stored as given. The analysis pointers that
  /// are not initialized here keep their nullptr member defaults.
  LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,

                        FunctionAnalysisManager *AM)

      : Func(F), DL(F.getDataLayout()), TTI(TTI), AM(AM) {}


  /// Return the estimated number of vector ops required for an operation on
  /// the fixed vector type \p VT, by forwarding its scalar type and element
  /// count to the two-argument overload.
  unsigned getNumOps(Type *VT) {
    assert(isa<FixedVectorType>(VT) && "Expected vector type");
    auto *FixedVT = cast<FixedVectorType>(VT);
    return getNumOps(FixedVT->getScalarType(), FixedVT->getNumElements());
  }


  /// Is this the minimal version executed in the backend pipelines. This is
  /// the case exactly when no dominator tree is available.
  bool isMinimal() const { return DT == nullptr; }


  /// Return the estimated number of vector ops required for an operation on
  /// \p N elements of scalar type \p ST: the total number of bits divided by
  /// the fixed-width vector register size reported by TTI, rounded up.
  unsigned getNumOps(Type *ST, unsigned N) {
    const auto TotalBits = (ST->getPrimitiveSizeInBits() * N).getFixedValue();
    const auto RegisterBits =
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
            .getFixedValue();
    return std::ceil(TotalBits / double(RegisterBits));
  }


  /// Return the set of vectors that a matrix value is lowered to.
  ///
  /// If \p MatrixVal was already lowered and the cached matrix has the shape
  /// \p SI, a copy of the cached matrix is returned. If it was lowered with a
  /// different shape, the cached matrix is first embedded in a flat vector.
  /// In all remaining cases the flat vector is split into vectors of
  /// SI.getStride() elements each using shuffles.
  MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
                     IRBuilder<> &Builder) {
    auto *FlatTy = cast<FixedVectorType>(MatrixVal->getType());
    assert(FlatTy->getNumElements() == SI.NumRows * SI.NumColumns &&
           "The vector size must match the number of matrix elements");

    // Reuse the result of a previous shape-aware lowering when its shape is
    // the requested one. On a mismatch, flatten it so it can be re-split.
    auto Cached = Inst2ColumnMatrix.find(MatrixVal);
    const bool WasLowered = Cached != Inst2ColumnMatrix.end();
    if (WasLowered) {
      MatrixTy &Lowered = Cached->second;
      if (SI.NumRows == Lowered.getNumRows() &&
          SI.NumColumns == Lowered.getNumColumns())
        return Lowered;

      MatrixVal = Lowered.embedInVector(Builder);
    }

    // Split the flat vector into consecutive chunks of one stride each.
    const unsigned Stride = SI.getStride();
    const unsigned NumElts = FlatTy->getNumElements();
    SmallVector<Value *, 16> SplitVecs;
    for (unsigned MaskStart = 0; MaskStart < NumElts; MaskStart += Stride)
      SplitVecs.push_back(Builder.CreateShuffleVector(
          MatrixVal, createSequentialMask(MaskStart, Stride, 0), "split"));

    // Bookkeeping for statistics and debug output only.
    if (auto *Inst = dyn_cast<Instruction>(MatrixVal)) {
      // Inst is only referenced by debug output below.
      (void)Inst;
      if (WasLowered) {
        // FIXME: re: "at least": SplitVecs.size() doesn't count the shuffles
        // that embedInVector created.
        LLVM_DEBUG(dbgs() << "matrix reshape from " << Cached->second.shape()
                          << " to " << SI << " using at least "
                          << SplitVecs.size() << " shuffles on behalf of:\n"
                          << *Inst << '\n');
        ++ReshapedMatrices;
      } else if (!ShapeMap.contains(MatrixVal)) {
        LLVM_DEBUG(
            dbgs()
            << "splitting a " << SI << " matrix with " << SplitVecs.size()
            << " shuffles beacuse we do not have a shape-aware lowering for "
               "its def:\n"
            << *Inst << '\n');
        ++SplitMatrices;
      }
      // Otherwise the ShapeMap has it, so it's a case where we're being
      // lowered before the def, and we expect that InstCombine will clean
      // things up afterward.
    }

    return {SplitVecs};
  }


  /// Record \p Shape for \p V in ShapeMap.
  ///
  /// Returns true only if a new entry was added. Returns false if \p V is an
  /// undef value, does not support shape information, or already has a
  /// recorded shape (which is never overridden). With the VerifyShapeInfo
  /// option set, a recorded shape that differs from \p Shape is reported as a
  /// fatal error.
  bool setShapeInfo(Value *V, ShapeInfo Shape) {
    assert(Shape && "Shape not set");
    if (isa<UndefValue>(V) || !supportsShapeInfo(V))
      return false;

    // try_emplace leaves an existing entry untouched.
    auto [Entry, Inserted] = ShapeMap.try_emplace(V, Shape);
    if (Inserted) {
      LLVM_DEBUG(dbgs() << "  " << Shape.NumRows << " x " << Shape.NumColumns
                        << " for " << *V << "\n");
      return true;
    }

    const ShapeInfo &Known = Entry->second;
    const bool Conflict = Known.NumRows != Shape.NumRows ||
                          Known.NumColumns != Shape.NumColumns;
    if (VerifyShapeInfo && Conflict) {
      errs() << "Conflicting shapes (" << Known.NumRows << "x"
             << Known.NumColumns << " vs " << Shape.NumRows << "x"
             << Shape.NumColumns << ") for " << *V << "\n";
      report_fatal_error(
          "Matrix shape verification failed, compilation aborted!");
    }

    LLVM_DEBUG(dbgs() << "  not overriding existing shape: " << Known.NumRows
                      << " " << Known.NumColumns << " for " << *V << "\n");
    return false;
  }


  /// Returns true if shape information can be used for \p V. The supported
  /// instructions must match the instructions that can be lowered by this
  /// pass: the matrix multiply/transpose/column-major load/store intrinsics,
  /// plain loads, stores and selects, and anything isUniformShape() accepts.
  /// Non-instructions are never supported.
  bool supportsShapeInfo(Value *V) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      return false;

    if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
      switch (II->getIntrinsicID()) {
      case Intrinsic::matrix_multiply:
      case Intrinsic::matrix_transpose:
      case Intrinsic::matrix_column_major_load:
      case Intrinsic::matrix_column_major_store:
        return true;
      default:
        return isUniformShape(II);
      }
    }

    if (isa<StoreInst>(V) || isa<LoadInst>(V) || isa<SelectInst>(V))
      return true;
    return isUniformShape(V);
  }


  /// Propagate the shape information of instructions to their users.
  /// \p WorkList contains instructions for which the shape may be computable,
  /// either from the information provided by matrix intrinsics or from known
  /// shapes of operands. It is drained by this function.
  ///
  /// Returns the instructions that received a new shape, in the order they
  /// were processed.
  SmallVector<Instruction *, 32>
  propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) {
    SmallVector<Instruction *, 32> NewWorkList;
    LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n");

    while (!WorkList.empty()) {
      Instruction *Inst = WorkList.pop_back_val();

      // Skip instructions without a computable shape and instructions for
      // which no new shape entry was recorded.
      std::optional<ShapeInfo> Shape = computeShapeInfoForInst(Inst, ShapeMap);
      if (!Shape || !setShapeInfo(Inst, *Shape))
        continue;

      // A new shape was recorded: remember the instruction and revisit all of
      // its users that do not have a shape yet.
      NewWorkList.push_back(Inst);
      for (User *U : Inst->users())
        if (!ShapeMap.contains(U))
          WorkList.push_back(cast<Instruction>(U));
    }

    return NewWorkList;
  }


  /// Propagate the shape to operands of instructions with shape information.
  /// \p WorkList contains the instructions for which we already know the
  /// shape; it is drained by this function.
  ///
  /// Returns the users of all operands that received a new shape; they serve
  /// as seeds for the next round of forward propagation.
  SmallVector<Instruction *, 32>
  propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) {
    SmallVector<Instruction *, 32> NewWorkList;

    // Record Shape for Op. If that added a new entry and Op is an
    // instruction, queue Op so its own operands are visited as well.
    auto SetAndQueue = [this, &WorkList](Value *Op, ShapeInfo Shape) {
      if (!setShapeInfo(Op, Shape))
        return;
      if (auto *OpInst = dyn_cast<Instruction>(Op))
        WorkList.push_back(OpInst);
    };

    LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n");
    while (!WorkList.empty()) {
      Value *V = WorkList.pop_back_val();

      // Everything at or beyond this index after processing V was queued on
      // behalf of V.
      const size_t FirstNew = WorkList.size();
      if (!isa<Instruction>(V))
        continue;

      Value *MatrixA;
      Value *MatrixB;
      Value *M;
      Value *N;
      Value *K;
      if (match(V, m_Intrinsic<Intrinsic::matrix_multiply>(
                       m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
                       m_Value(N), m_Value(K)))) {
        // multiply(A, B, M, N, K): A is M x N and B is N x K.
        SetAndQueue(MatrixA, ShapeInfo(M, N));
        SetAndQueue(MatrixB, ShapeInfo(N, K));
      } else if (match(V, m_Intrinsic<Intrinsic::matrix_transpose>(
                              m_Value(MatrixA), m_Value(M), m_Value(N)))) {
        // The dimension arguments describe the operand, so the operand gets
        // M x N (the result is the flipped one).
        SetAndQueue(MatrixA, ShapeInfo(M, N));
      } else if (match(V, m_Intrinsic<Intrinsic::matrix_column_major_store>(
                              m_Value(MatrixA), m_Value(), m_Value(), m_Value(),
                              m_Value(M), m_Value(N)))) {
        SetAndQueue(MatrixA, ShapeInfo(M, N));
      } else if (isa<LoadInst>(V) ||
                 match(V, m_Intrinsic<Intrinsic::matrix_column_major_load>())) {
        // Nothing to do, no matrix input.
      } else if (isa<StoreInst>(V)) {
        // Nothing to do.  We forward-propagated to this so we would just
        // backward propagate to an instruction with an already known shape.
      } else if (isUniformShape(V) || isa<SelectInst>(V)) {
        // All shaped operands share the shape of V. For a select the
        // condition (first operand) is skipped.
        auto Ops = cast<Instruction>(V)->operands();
        auto ShapedOps = isa<SelectInst>(V) ? drop_begin(Ops) : Ops;
        ShapeInfo Shape = ShapeMap[V];
        for (Use &U : ShapedOps)
          SetAndQueue(U.get(), Shape);
      }

      // The users of every operand that just received a shape (other than V
      // itself) become seeds for the next round of forward propagation.
      for (size_t Idx = FirstNew, End = WorkList.size(); Idx != End; ++Idx)
        for (User *U : WorkList[Idx]->users())
          if (isa<Instruction>(U) && V != U)
            NewWorkList.push_back(cast<Instruction>(U));
    }
    return NewWorkList;
  }


  /// (Op0 op Op1)^T -> Op0^T op Op1^T
  /// Emit a transpose of \p Op0 (shape \p Shape0) and of \p Op1 (shape
  /// \p Shape1), register the transposed shapes for the new instructions, and
  /// return the result of calling \p Operation on the two transposes together
  /// with their transposed shapes.
  Instruction *distributeTransposes(
      Value *Op0, ShapeInfo Shape0, Value *Op1, ShapeInfo Shape1,
      MatrixBuilder &Builder,
      function_ref<Instruction *(Value *, ShapeInfo, Value *, ShapeInfo)>
          Operation) {
    // We are being run after shape prop, so each newly created transpose gets
    // its shape recorded right away so that we lower it later.
    auto EmitTranspose = [&](Value *Op, const ShapeInfo &Shape) -> Value * {
      Value *Transposed = Builder.CreateMatrixTranspose(
          Op, Shape.NumRows, Shape.NumColumns, Op->getName() + "_t");
      setShapeInfo(Transposed, Shape.t());
      return Transposed;
    };

    Value *T0 = EmitTranspose(Op0, Shape0);
    Value *T1 = EmitTranspose(Op1, Shape1);
    return Operation(T0, Shape0.t(), T1, Shape1.t());
  }


  /// Drop the ShapeMap entry for \p Inst, if there is one, and then delete
  /// \p Inst from its parent.
  void eraseFromParentAndRemoveFromShapeMap(Instruction *Inst) {
    if (auto Entry = ShapeMap.find(Inst); Entry != ShapeMap.end())
      ShapeMap.erase(Entry);
    Inst->eraseFromParent();
  }


  /// Erase \p V (which must be an instruction) if it has no remaining uses.
  /// When \p II currently refers to that instruction, \p II is advanced first
  /// so that the caller's reverse iteration over \p BB is not invalidated.
  /// Instructions that still have uses are left untouched.
  void eraseFromParentAndMove(Value *V, BasicBlock::reverse_iterator &II,
                              BasicBlock &BB) {
    auto *Inst = cast<Instruction>(V);
    if (Inst->use_empty()) {
      // Step off the instruction before it goes away.
      const bool IteratorOnInst = II != BB.rend() && &*II == Inst;
      if (IteratorOnInst)
        ++II;
      eraseFromParentAndRemoveFromShapeMap(Inst);
    }
  }


  /// Transfer the shape information of \p Old to \p New and replace all uses
  /// of \p Old with \p New.
  ///
  /// If \p Old has a ShapeMap entry, that entry is removed and, provided
  /// \p New supports shape info, re-inserted under \p New. If \p Old has no
  /// entry, only the use replacement is performed.
  void updateShapeAndReplaceAllUsesWith(Instruction &Old, Value *New) {
    // We need to remove Old from the ShapeMap otherwise RAUW will replace it
    // with New. We should only add New if it supportsShapeInfo so we insert
    // it conditionally instead.
    auto S = ShapeMap.find(&Old);
    if (S != ShapeMap.end()) {
      // Copy the shape out before erasing: the iterator must not be
      // dereferenced once its entry has been removed from the map.
      ShapeInfo OldShape = S->second;
      ShapeMap.erase(S);
      if (supportsShapeInfo(New))
        ShapeMap.insert({New, OldShape});
    }
    Old.replaceAllUsesWith(New);
  }


  /// Sink a transpose sitting at the top of an expression into its operand.
  ///
  /// If \p I is a matrix transpose, the following rewrites are tried in order:
  ///  * (A^T)^T -> A, when the inner transpose has the swapped dimensions,
  ///  * k^T -> k, when the operand is a splat,
  ///  * (A * B)^T -> B^T * A^T, for a matrix multiply operand,
  ///  * (A * k)^T -> A^T * k^T, for an element-wise multiply with a splat,
  ///  * (A + B)^T -> A^T + B^T, for an element-wise add.
  /// Instructions that become unused are erased and \p II is advanced past
  /// them so the caller's iteration stays valid. \p Changed is set to true
  /// whenever a rewrite is applied.
  /// \returns the newly created top-level instruction, or nullptr when no new
  /// instruction was created (no match, or the transpose was folded away).
  Instruction *sinkTranspose(Instruction &I, BasicBlock::reverse_iterator &II,
                             bool &Changed) {
    Value *TA;
    ConstantInt *R, *K, *C;
    if (!match(&I, m_Intrinsic<Intrinsic::matrix_transpose>(
                       m_Value(TA), m_ConstantInt(R), m_ConstantInt(C))))
      return nullptr;

    BasicBlock &BB = *I.getParent();
    IRBuilder<> IB(&I);
    MatrixBuilder Builder(IB);

    // Common tail of every rewrite: redirect users of the transpose to
    // Replacement, then erase the transpose and, if requested, its operand
    // (each is only erased once it has no uses left).
    auto ReplaceTranspose = [&](Value *Replacement, bool EraseOperandToo) {
      updateShapeAndReplaceAllUsesWith(I, Replacement);
      eraseFromParentAndMove(&I, II, BB);
      if (EraseOperandToo)
        eraseFromParentAndMove(TA, II, BB);
      Changed = true;
    };

    // A transpose of a transpose cancels out when the shapes match.
    Value *TATA;
    if (match(TA, m_Intrinsic<Intrinsic::matrix_transpose>(
                      m_Value(TATA), m_Specific(C), m_Specific(R)))) {
      ReplaceTranspose(TATA, /*EraseOperandToo=*/true);
      return nullptr;
    }

    // k^T -> k
    if (isSplat(TA)) {
      ReplaceTranspose(TA, /*EraseOperandToo=*/false);
      return nullptr;
    }

    Value *TAMA, *TAMB;

    // (A * B)^t -> B^t * A^t
    // RxK KxC      CxK   KxR
    if (match(TA, m_Intrinsic<Intrinsic::matrix_multiply>(
                      m_Value(TAMA), m_Value(TAMB), m_ConstantInt(R),
                      m_ConstantInt(K), m_ConstantInt(C)))) {
      Instruction *NewInst = distributeTransposes(
          TAMB, {K, C}, TAMA, {R, K}, Builder,
          [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) {
            return Builder.CreateMatrixMultiply(T0, T1, Shape0.NumRows,
                                                Shape0.NumColumns,
                                                Shape1.NumColumns, "mmul");
          });
      ReplaceTranspose(NewInst, /*EraseOperandToo=*/true);
      return NewInst;
    }

    // Same idea for an element-wise mul, which is what a multiplication with
    // a scalar looks like.
    // (A * k)^t -> A^t * k
    //  R  x  C     RxC
    const bool IsMulWithSplat =
        match(TA, m_AnyMul(m_Value(TAMA), m_Value(TAMB))) &&
        (isSplat(TAMA) || isSplat(TAMB));
    if (IsMulWithSplat) {
      IRBuilder<> LocalBuilder(&I);
      // The transposed operand has shape RxC, and multiplying with a scalar
      // preserves that shape.
      Instruction *NewInst = distributeTransposes(
          TAMA, {R, C}, TAMB, {R, C}, Builder,
          [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo) {
            Value *Mul;
            if (I.getType()->isFPOrFPVectorTy())
              Mul = LocalBuilder.CreateFMul(T0, T1, "mmul");
            else
              Mul = LocalBuilder.CreateMul(T0, T1, "mmul");
            auto *Result = cast<Instruction>(Mul);
            setShapeInfo(Result, Shape0);
            return Result;
          });
      ReplaceTranspose(NewInst, /*EraseOperandToo=*/true);
      return NewInst;
    }

    // (A + B)^t -> A^t + B^t
    // RxC RxC      CxR   CxR
    if (match(TA, m_AnyAdd(m_Value(TAMA), m_Value(TAMB)))) {
      IRBuilder<> LocalBuilder(&I);
      Instruction *NewInst = distributeTransposes(
          TAMA, {R, C}, TAMB, {R, C}, Builder,
          [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo) {
            Value *Add;
            if (I.getType()->isFPOrFPVectorTy())
              Add = LocalBuilder.CreateFAdd(T0, T1, "madd");
            else
              Add = LocalBuilder.CreateAdd(T0, T1, "madd");
            auto *Result = cast<Instruction>(Add);
            setShapeInfo(Result, Shape0);
            return Result;
          });
      ReplaceTranspose(NewInst, /*EraseOperandToo=*/true);
      return NewInst;
    }

    return nullptr;
  }


  /// Lift transposes out of the operands of \p I.
  ///
  /// Two patterns are handled:
  ///  * a matrix multiply whose operands are both transposes:
  ///      A^t * B^t -> (B * A)^t
  ///  * an fadd whose operands are both transposes:
  ///      A^t + B^t -> (A + B)^t
  /// The replaced instruction and any operand transposes left without uses
  /// are erased.
  /// \returns true if \p I was rewritten.
  bool liftTranspose(Instruction &I) {
    // Erase the replaced instruction and its operands once they are dead.
    // The A != B check avoids touching the same operand twice.
    auto CleanupBinOp = [this](Instruction &Replaced, Value *LHS, Value *RHS) {
      if (Replaced.use_empty())
        eraseFromParentAndRemoveFromShapeMap(&Replaced);
      if (LHS->use_empty())
        eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(LHS));
      if (LHS != RHS && RHS->use_empty())
        eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(RHS));
    };

    Value *A, *B, *AT, *BT;
    ConstantInt *R, *K, *C;

    // A^t * B ^t -> (B * A)^t
    const bool IsMultiplyOfTransposes =
        match(&I, m_Intrinsic<Intrinsic::matrix_multiply>(
                      m_Value(A), m_Value(B), m_ConstantInt(R),
                      m_ConstantInt(K), m_ConstantInt(C))) &&
        match(A, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(AT))) &&
        match(B, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(BT)));
    if (IsMultiplyOfTransposes) {
      IRBuilder<> IB(&I);
      MatrixBuilder Builder(IB);
      const auto Rows = R->getZExtValue();
      const auto Inner = K->getZExtValue();
      const auto Cols = C->getZExtValue();
      Value *M = Builder.CreateMatrixMultiply(BT, AT, Cols, Inner, Rows);
      setShapeInfo(M, {C, R});
      Instruction *NewInst = Builder.CreateMatrixTranspose(M, Cols, Rows);
      updateShapeAndReplaceAllUsesWith(I, NewInst);
      CleanupBinOp(I, A, B);
      return true;
    }

    // A^t + B ^t -> (A + B)^t. Rows and columns are taken from the first
    // transpose. If the second transpose has a different shape, that is a
    // shape conflict, which is resolved in favour of the first operand.
    const bool IsFAddOfTransposes =
        match(&I, m_FAdd(m_Value(A), m_Value(B))) &&
        match(A, m_Intrinsic<Intrinsic::matrix_transpose>(
                     m_Value(AT), m_ConstantInt(R), m_ConstantInt(C))) &&
        match(B, m_Intrinsic<Intrinsic::matrix_transpose>(
                     m_Value(BT), m_ConstantInt(), m_ConstantInt()));
    if (!IsFAddOfTransposes)
      return false;

    IRBuilder<> Builder(&I);
    auto *Add = Builder.CreateFAdd(AT, BT, "mfadd");
    MatrixBuilder MBuilder(Builder);
    Instruction *NewInst = MBuilder.CreateMatrixTranspose(
        Add, R->getZExtValue(), C->getZExtValue(), "mfadd_t");
    updateShapeAndReplaceAllUsesWith(I, NewInst);
    assert(computeShapeInfoForInst(NewInst, ShapeMap) ==
               computeShapeInfoForInst(&I, ShapeMap) &&
           "Shape of new instruction doesn't match original shape.");
    CleanupBinOp(I, A, B);
    // The add may have been folded to a non-instruction; only instructions
    // get a shape recorded.
    if (auto *AddI = dyn_cast<Instruction>(Add)) {
      setShapeInfo(AddI, {R, C});
      assert(
          computeShapeInfoForInst(AddI, ShapeMap).value_or(ShapeMap[AddI]) ==
              ShapeMap[AddI] &&
          "Shape of updated addition doesn't match cached shape.");
    }
    return true;
  }


  /// Move transposes around so that they can be folded away or folded into
  /// multiplies.
  /// \returns true if the function was modified.
  bool optimizeTransposes() {
    bool Changed = false;

    // Phase 1: sink every transpose into matmuls and adds, hoping to end up
    // with NN, NT or TN variants. Blocks and instructions are walked in
    // reverse.
    for (BasicBlock &BB : reverse(Func)) {
      auto II = BB.rbegin();
      while (II != BB.rend()) {
        Instruction &I = *II;
        // sinkTranspose may erase I, so step to the next candidate before
        // calling it.
        ++II;
        Instruction *NewInst = sinkTranspose(I, II, Changed);
        if (NewInst)
          II = std::next(BasicBlock::reverse_iterator(NewInst));
      }
    }

    // Phase 2: for a TT matmul or a TT add, lift the transpose; it may then
    // fold into a consuming multiply or add.
    for (BasicBlock &BB : Func)
      for (Instruction &I : llvm::make_early_inc_range(BB))
        if (liftTranspose(I))
          Changed = true;

    return Changed;
  }


  /// Drive the lowering of all matrix operations in Func.
  ///
  /// Steps, in order:
  ///  1. Seed a worklist with the matrix intrinsics (multiply, transpose,
  ///     column-major load/store); bail out with false if there are none.
  ///  2. If an analysis manager is available, fetch ORE, AA, DT and LI.
  ///  3. Propagate shape information forward and backward until the worklist
  ///     is empty.
  ///  4. Unless isMinimal(), run the transpose optimizations.
  ///  5. Collect shaped instructions in reverse post-order, then lower dot
  ///     products, try fusion, pre-create PHIs, and lower everything else.
  ///  6. Emit remarks (if ORE is set) and erase the instructions queued in
  ///     ToRemove.
  /// \returns true if the function was changed.
  bool Visit() {

    SmallVector<Instruction *, 32> WorkList;


    // Initially only the shape of matrix intrinsics is known.

    // Initialize the work list with ops carrying shape information.

    for (BasicBlock &BB : Func)

      for (Instruction &Inst : BB) {

        IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst);

        if (!II)

          continue;


        switch (II->getIntrinsicID()) {

        case Intrinsic::matrix_multiply:

        case Intrinsic::matrix_transpose:

        case Intrinsic::matrix_column_major_load:

        case Intrinsic::matrix_column_major_store:

          WorkList.push_back(&Inst);

          break;

        default:

          break;

        }

      }


    // Avoid unnecessary work if there are no matrix intrinsics in the function.

    if (WorkList.empty())

      return false;


    // The analyses are only requested once we know there is work to do.

    if (AM) {

      ORE = &AM->getResult<OptimizationRemarkEmitterAnalysis>(Func);

      AA = &AM->getResult<AAManager>(Func);

      DT = &AM->getResult<DominatorTreeAnalysis>(Func);

      LI = &AM->getResult<LoopAnalysis>(Func);

    }


    // Propagate shapes until nothing changes any longer.

    while (!WorkList.empty()) {

      WorkList = propagateShapeForward(WorkList);

      WorkList = propagateShapeBackward(WorkList);

    }


    bool Changed = false;

    if (!isMinimal()) {

      Changed |= optimizeTransposes();

      if (PrintAfterTransposeOpt) {

        dbgs() << "Dump after matrix transpose optimization:\n";

        Func.print(dbgs());

      }

    }


    SmallVector<CallInst *, 16> MaybeFusableInsts;

    SmallVector<Instruction *, 16> MatrixInsts;

    SmallVector<IntrinsicInst *, 16> LifetimeEnds;


    // First, collect all instructions with shape information and candidates for

    // fusion (currently only matrix multiplies).

    ReversePostOrderTraversal<Function *> RPOT(&Func);

    for (auto *BB : RPOT)

      for (Instruction &I : *BB) {

        // lifetime.end markers are recorded regardless of shape info; they are

        // handed to the fusion step below.

        if (match(&I, m_Intrinsic<Intrinsic::lifetime_end>()))

          LifetimeEnds.push_back(cast<IntrinsicInst>(&I));

        if (!ShapeMap.contains(&I))

          continue;

        if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))

          MaybeFusableInsts.push_back(cast<CallInst>(&I));

        MatrixInsts.push_back(&I);

      }


    // Second, try to lower any dot products

    SmallPtrSet<Instruction *, 16> FusedInsts;

    for (CallInst *CI : MaybeFusableInsts)

      lowerDotProduct(CI, FusedInsts, getFastMathFlags(CI));


    // Third, try to fuse candidates.

    for (CallInst *CI : MaybeFusableInsts)

      if (!FusedInsts.contains(CI))

        LowerMatrixMultiplyFused(CI, FusedInsts, LifetimeEnds);


    Changed |= !FusedInsts.empty();


    // Fourth, pre-process all the PHINode's. The incoming values will be

    // assigned later in VisitPHI.

    for (Instruction *Inst : MatrixInsts) {

      if (FusedInsts.count(Inst))

        continue;


      auto *PHI = dyn_cast<PHINode>(Inst);

      if (!PHI)

        continue;


      const ShapeInfo &SI = ShapeMap.at(Inst);

      auto *EltTy = cast<FixedVectorType>(PHI->getType())->getElementType();

      MatrixTy PhiM(SI.NumRows, SI.NumColumns, EltTy);


      // Create one (still empty) PHI per vector of the matrix, right before the

      // original PHI.

      IRBuilder<> Builder(Inst);

      for (unsigned VI = 0, VE = PhiM.getNumVectors(); VI != VE; ++VI)

        PhiM.setVector(VI, Builder.CreatePHI(PhiM.getVectorTy(),

                                             PHI->getNumIncomingValues(),

                                             PHI->getName()));

      assert(!Inst2ColumnMatrix.contains(PHI) && "map already contains phi?");

      Inst2ColumnMatrix[PHI] = PhiM;

    }


    // Fifth, lower remaining instructions with shape information.

    for (Instruction *Inst : MatrixInsts) {

      if (FusedInsts.count(Inst))

        continue;


      // NOTE(review): SI is a reference into ShapeMap; confirm that none of the

      // Visit* helpers below can invalidate it while it is still in use.

      const ShapeInfo &SI = ShapeMap.at(Inst);


      Value *Op1;

      Value *Op2;

      MatrixTy Result;

      IRBuilder<> Builder(Inst);

      if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))

        Result = VisitBinaryOperator(BinOp, SI, Builder);

      else if (auto *Cast = dyn_cast<CastInst>(Inst))

        Result = VisitCastInstruction(Cast, SI, Builder);

      else if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))

        Result = VisitUnaryOperator(UnOp, SI, Builder);

      else if (auto *Intr = dyn_cast<IntrinsicInst>(Inst))

        Result = VisitIntrinsicInst(Intr, SI, Builder);

      else if (auto *Select = dyn_cast<SelectInst>(Inst))

        Result = VisitSelectInst(Select, SI, Builder);

      else if (match(Inst, m_Load(m_Value(Op1))))

        Result = VisitLoad(cast<LoadInst>(Inst), SI, Op1, Builder);

      else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))

        Result = VisitStore(cast<StoreInst>(Inst), SI, Op1, Op2, Builder);

      else if (auto *PHI = dyn_cast<PHINode>(Inst))

        Result = VisitPHI(PHI, SI, Builder);

      else

        // No lowering for this kind of instruction; leave it untouched.

        continue;


      finalizeLowering(Inst, Result, Builder);

      Changed = true;

    }


    if (ORE) {

      RemarkGenerator RemarkGen(Inst2ColumnMatrix, *ORE, Func);

      RemarkGen.emitRemarks();

    }


    // Delete the instructions backwards, as it has a reduced likelihood of

    // having to update as many def-use and use-def chains.

    //

    // Because we add to ToRemove during fusion we can't guarantee that defs

    // are before uses.  Change uses to poison temporarily as these should get

    // removed as well.

    //

    // For verification, we keep track of where we changed uses to poison in

    // PoisonedInsts and then check that we in fact remove them.

    SmallPtrSet<Instruction *, 16> PoisonedInsts;

    for (auto *Inst : reverse(ToRemove)) {

      for (Use &U : llvm::make_early_inc_range(Inst->uses())) {

        if (auto *Poisoned = dyn_cast<Instruction>(U.getUser()))

          PoisonedInsts.insert(Poisoned);

        U.set(PoisonValue::get(Inst->getType()));

      }

      Inst->eraseFromParent();

      PoisonedInsts.erase(Inst);

    }

    if (!PoisonedInsts.empty()) {

      // If we didn't remove all poisoned instructions, it's a hard error.

      dbgs() << "Poisoned but present instructions:\n";

      for (auto *I : PoisonedInsts)

        dbgs() << *I << "\n";

      llvm_unreachable("Poisoned but instruction not removed");

    }


    return Changed;

  }


  /// Lower the intrinsic call \p Inst, whose result has shape \p SI.
  ///
  /// The matrix intrinsics are forwarded to their dedicated lowering routines.
  /// abs and fabs are applied to each vector of the operand matrix. Any other
  /// intrinsic is a programming error.
  MatrixTy VisitIntrinsicInst(IntrinsicInst *Inst, const ShapeInfo &SI,
                              IRBuilder<> &Builder) {
    assert(Inst->getCalledFunction() &&
           Inst->getCalledFunction()->isIntrinsic());

    const auto IID = Inst->getCalledFunction()->getIntrinsicID();
    if (IID == Intrinsic::matrix_multiply)
      return LowerMultiply(Inst, Builder);
    if (IID == Intrinsic::matrix_transpose)
      return LowerTranspose(Inst, Builder);
    if (IID == Intrinsic::matrix_column_major_load)
      return LowerColumnMajorLoad(Inst, Builder);
    if (IID == Intrinsic::matrix_column_major_store)
      return LowerColumnMajorStore(Inst, Builder);

    if (IID == Intrinsic::abs || IID == Intrinsic::fabs) {
      MatrixTy Result;
      MatrixTy M = getMatrix(Inst->getOperand(0), SI, Builder);
      Builder.setFastMathFlags(getFastMathFlags(Inst));

      // abs carries a second operand that is passed through unchanged to
      // every per-vector call; fabs is unary.
      const bool IsIntegerAbs = Inst->getIntrinsicID() == Intrinsic::abs;
      for (auto *Vector : M.vectors()) {
        if (IsIntegerAbs)
          Result.addVector(Builder.CreateBinaryIntrinsic(Intrinsic::abs, Vector,
                                                         Inst->getOperand(1)));
        else
          Result.addVector(
              Builder.CreateUnaryIntrinsic(Inst->getIntrinsicID(), Vector));
      }

      return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                     Result.getNumVectors());
    }

    llvm_unreachable(
        "only intrinsics supporting shape info should be seen here");
  }


  /// Compute the alignment of the vector (column or row) at index \p Idx when
  /// consecutive vectors are \p Stride elements apart.
  ///
  /// \p A is the alignment of the address at \p Idx == 0; when it is not set,
  /// the ABI alignment of \p ElementTy is used. For a ConstantInt \p Stride
  /// the result is the initial alignment reduced by the byte offset of vector
  /// \p Idx. For any other stride it is the common alignment of the initial
  /// alignment and the element size in bytes.
  Align getAlignForIndex(unsigned Idx, Value *Stride, Type *ElementTy,
                         MaybeAlign A) const {
    const Align BaseAlign = DL.getValueOrABITypeAlignment(A, ElementTy);
    if (Idx == 0)
      return BaseAlign;

    TypeSize ElementSizeInBits = DL.getTypeSizeInBits(ElementTy);
    auto *ConstStride = dyn_cast<ConstantInt>(Stride);
    if (!ConstStride)
      return commonAlignment(BaseAlign, ElementSizeInBits / 8);

    uint64_t StrideInBytes =
        ConstStride->getZExtValue() * ElementSizeInBits / 8;
    return commonAlignment(BaseAlign, Idx * StrideInBytes);
  }


  /// Return the data layout's index type for the type of \p Ptr.
  IntegerType *getIndexType(Value *Ptr) const {
    auto *IndexTy = DL.getIndexType(Ptr->getType());
    return cast<IntegerType>(IndexTy);
  }


  /// Return the constant \p V as a value of the index type of \p Ptr.
  Value *getIndex(Value *Ptr, uint64_t V) const {
    IntegerType *IndexTy = getIndexType(Ptr);
    return ConstantInt::get(IndexTy, V);
  }


  /// Convert the integer \p V to the index type of \p Ptr.
  ///
  /// The data layout's index type may be wider or narrower than the type of
  /// \p V, so the value is zero extended or truncated as needed. Zero
  /// extension is used because indices are unsigned.
  Value *castToIndexType(Value *Ptr, Value *V, IRBuilder<> &Builder) const {
    assert(isa<IntegerType>(V->getType()) &&
           "Attempted to cast non-integral type to integer index");
    IntegerType *IndexTy = getIndexType(Ptr);
    return Builder.CreateZExtOrTrunc(V, IndexTy, V->getName() + ".cast");
  }


  /// Load a matrix of shape \p Shape from \p Ptr, one vector at a time, with
  /// \p Stride elements between the starts of consecutive vectors.
  ///
  /// \p Ty is the flat vector type of the matrix and supplies the element
  /// type. \p MAlign is the alignment of the first vector; later vectors get
  /// an alignment derived from it via getAlignForIndex.
  /// \returns the loaded matrix, with its load count updated.
  MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
                      bool IsVolatile, ShapeInfo Shape, IRBuilder<> &Builder) {
    Type *EltTy = cast<FixedVectorType>(Ty)->getElementType();
    Type *VecTy = FixedVectorType::get(EltTy, Shape.getStride());

    Stride = castToIndexType(Ptr, Stride, Builder);
    const unsigned IndexBits = Stride->getType()->getScalarSizeInBits();

    MatrixTy Result;
    const unsigned NumVectors = Shape.getNumVectors();
    for (unsigned I = 0; I < NumVectors; ++I) {
      Value *VecAddr =
          computeVectorAddr(Ptr, Builder.getIntN(IndexBits, I), Stride,
                            Shape.getStride(), EltTy, Builder);
      Value *Vector = Builder.CreateAlignedLoad(
          VecTy, VecAddr, getAlignForIndex(I, Stride, EltTy, MAlign),
          IsVolatile, "col.load");
      Result.addVector(Vector);
    }
    return Result.addNumLoads(getNumOps(Result.getVectorTy()) *
                              Result.getNumVectors());
  }


  /// Load a tile of shape \p ResultShape out of the larger matrix of shape
  /// \p MatrixShape stored at \p MatrixPtr. The tile starts at
  /// \p MatrixPtr[I][J]; its element offset is J * stride + I, where stride is
  /// the stride of \p MatrixShape.
  MatrixTy loadMatrix(Value *MatrixPtr, MaybeAlign Align, bool IsVolatile,
                      ShapeInfo MatrixShape, Value *I, Value *J,
                      ShapeInfo ResultShape, Type *EltTy,
                      IRBuilder<> &Builder) {
    Value *MatrixStride = getIndex(MatrixPtr, MatrixShape.getStride());
    Value *VectorStart = Builder.CreateMul(J, MatrixStride);
    Value *Offset = Builder.CreateAdd(VectorStart, I);
    Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);

    const auto TileElts = ResultShape.NumRows * ResultShape.NumColumns;
    auto *TileTy = FixedVectorType::get(EltTy, TileElts);

    // Vectors of the tile are still spaced by the stride of the full matrix.
    return loadMatrix(TileTy, TileStart, Align, MatrixStride, IsVolatile,
                      ResultShape, Builder);
  }


  /// Lower a load with shape information: load a matrix of shape \p Shape
  /// from \p Ptr with \p Stride between vectors. The result type of \p Inst
  /// is used as the flat matrix type.
  MatrixTy LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align,
                     Value *Stride, bool IsVolatile, ShapeInfo Shape,
                     IRBuilder<> &Builder) {
    Type *FlatTy = Inst->getType();
    return loadMatrix(FlatTy, Ptr, Align, Stride, IsVolatile, Shape, Builder);
  }


  /// Lowers llvm.matrix.column.major.load.
  ///
  /// The intrinsic loads a matrix from memory using a stride between columns.
  /// The call's arguments are read as: pointer (0), stride (1), volatile
  /// flag (2), and the two shape operands (3, 4).
  MatrixTy LowerColumnMajorLoad(CallInst *Inst, IRBuilder<> &Builder) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Intrinsic only supports column-major layout!");
    Value *Ptr = Inst->getArgOperand(0);
    Value *Stride = Inst->getArgOperand(1);
    const bool IsVolatile = cast<ConstantInt>(Inst->getArgOperand(2))->isOne();
    ShapeInfo Shape(Inst->getArgOperand(3), Inst->getArgOperand(4));
    return LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride, IsVolatile,
                     Shape, Builder);
  }


  /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p

  /// MatrixPtr[I][J].

  void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr,

                   MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,

                   Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {

    Value *Offset = Builder.CreateAdd(

        Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride())), I);


    Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);

    auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *

                                                   StoreVal.getNumColumns());


    storeMatrix(TileTy, StoreVal, TileStart, MAlign,

                getIndex(MatrixPtr, MatrixShape.getStride()), IsVolatile,

                Builder);

  }


  /// Store the vectors of \p StoreVal to memory starting at \p Ptr, with
  /// \p Stride elements between the starts of consecutive vectors.
  ///
  /// \p Ty is the flat vector type of the matrix and supplies the element
  /// type. \p MAlign is the alignment of the first vector; later vectors get
  /// an alignment derived from it via getAlignForIndex.
  /// \returns an otherwise empty matrix that only carries the store count.
  MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr,
                       MaybeAlign MAlign, Value *Stride, bool IsVolatile,
                       IRBuilder<> &Builder) {
    Type *EltTy = cast<FixedVectorType>(Ty)->getElementType();
    Stride = castToIndexType(Ptr, Stride, Builder);
    const unsigned IndexBits = Stride->getType()->getScalarSizeInBits();

    unsigned VecIdx = 0;
    for (auto *Vec : StoreVal.vectors()) {
      Value *VecAddr =
          computeVectorAddr(Ptr, Builder.getIntN(IndexBits, VecIdx), Stride,
                            StoreVal.getStride(), EltTy, Builder);
      Builder.CreateAlignedStore(
          Vec, VecAddr, getAlignForIndex(VecIdx, Stride, EltTy, MAlign),
          IsVolatile);
      ++VecIdx;
    }

    MatrixTy StoreInfo;
    StoreInfo.addNumStores(getNumOps(StoreVal.getVectorTy()) *
                           StoreVal.getNumVectors());
    return StoreInfo;
  }


  /// Lower a store with shape information: fetch \p Matrix as a matrix of
  /// shape \p Shape and store it to \p Ptr with \p Stride between vectors.
  /// The type of \p Matrix is used as the flat matrix type.
  MatrixTy LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr,
                      MaybeAlign A, Value *Stride, bool IsVolatile,
                      ShapeInfo Shape, IRBuilder<> &Builder) {
    MatrixTy Vectors = getMatrix(Matrix, Shape, Builder);
    Type *FlatTy = Matrix->getType();
    return storeMatrix(FlatTy, Vectors, Ptr, A, Stride, IsVolatile, Builder);
  }


  /// Lowers llvm.matrix.column.major.store.
  ///
  /// The intrinsic stores a matrix back to memory using a stride between
  /// columns. The call's arguments are read as: matrix (0), pointer (1),
  /// stride (2), volatile flag (3), and the two shape operands (4, 5).
  MatrixTy LowerColumnMajorStore(CallInst *Inst, IRBuilder<> &Builder) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Intrinsic only supports column-major layout!");
    Value *Matrix = Inst->getArgOperand(0);
    Value *Ptr = Inst->getArgOperand(1);
    Value *Stride = Inst->getArgOperand(2);
    const bool IsVolatile = cast<ConstantInt>(Inst->getArgOperand(3))->isOne();
    ShapeInfo Shape(Inst->getArgOperand(4), Inst->getArgOperand(5));
    return LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride,
                      IsVolatile, Shape, Builder);
  }


  /// Overwrite the elements of \p Col starting at index \p I with the
  /// elements of \p Block and return the resulting vector.
  ///
  /// \p Block must not have more elements than \p Col.
  Value *insertVector(Value *Col, unsigned I, Value *Block,
                      IRBuilder<> &Builder) {
    const unsigned BlockNumElts =
        cast<FixedVectorType>(Block->getType())->getNumElements();
    const unsigned ColNumElts =
        cast<FixedVectorType>(Col->getType())->getNumElements();
    assert(ColNumElts >= BlockNumElts && "Too few elements for current block");

    // Widen Block to the length of Col so both shuffle operands have the same
    // type.
    Block = Builder.CreateShuffleVector(
        Block,
        createSequentialMask(0, BlockNumElts, ColNumElts - BlockNumElts));

    // Mask indices below ColNumElts select from Col, the rest select from the
    // widened Block. E.g. if Col is 7 long, I is 2 and BlockNumElts is 2, the
    // mask is: 0, 1, 7, 8, 4, 5, 6
    SmallVector<int, 16> Mask;
    unsigned Lane = 0;
    // Leading elements of Col.
    for (; Lane < I; ++Lane)
      Mask.push_back(Lane);
    // Elements taken from Block.
    for (; Lane < I + BlockNumElts; ++Lane)
      Mask.push_back(Lane - I + ColNumElts);
    // Trailing elements of Col.
    for (; Lane < ColNumElts; ++Lane)
      Mask.push_back(Lane);

    return Builder.CreateShuffleVector(Col, Block, Mask);
  }


  /// Emit \p Sum + \p A * \p B, or just \p A * \p B when \p Sum is null.
  ///
  /// Floating point instructions are used when \p UseFPOp is set. With
  /// \p AllowContraction, the floating point multiply and add are emitted as a
  /// single fmuladd intrinsic call. \p NumComputeOps is increased by
  /// getNumOps of the type of \p A for each multiply, fmuladd or add emitted.
  Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
                      IRBuilder<> &Builder, bool AllowContraction,
                      unsigned &NumComputeOps) {
    NumComputeOps += getNumOps(A->getType());

    // Nothing to accumulate into yet: a plain multiply is enough.
    if (!Sum)
      return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);

    // Use fmuladd for floating point operations and let the backend decide
    // if that's profitable.
    if (UseFPOp && AllowContraction)
      return Builder.CreateIntrinsic(Intrinsic::fmuladd, A->getType(),
                                     {A, B, Sum});

    // Separate multiply and add: account for the additional operation.
    NumComputeOps += getNumOps(A->getType());
    Value *Product =
        UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
    return UseFPOp ? Builder.CreateFAdd(Sum, Product)
                   : Builder.CreateAdd(Sum, Product);
  }


  /// Record \p Matrix as the lowered form of \p Inst, queue \p Inst for
  /// deletion and fix up its users.
  ///
  /// Users that have shape information are skipped; they pick up the cached
  /// matrix when they are lowered themselves. For every other user,
  /// \p Matrix is flattened into a single vector (at most once) and the use is
  /// redirected to that vector.
  void finalizeLowering(Instruction *Inst, MatrixTy Matrix,
                        IRBuilder<> &Builder) {
    auto InsertResult =
        Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
    (void)InsertResult;
    // An existing entry is only accepted for PHI nodes.
    assert((InsertResult.second || isa<PHINode>(Inst)) &&
           "multiple matrix lowering mapping");

    ToRemove.push_back(Inst);

    Value *Flattened = nullptr;
    for (Use &U : llvm::make_early_inc_range(Inst->uses())) {
      const bool UserHasShape = ShapeMap.contains(U.getUser());
      if (UserHasShape)
        continue;

      // Flatten lazily, on the first user that needs the flat vector.
      if (Flattened == nullptr) {
        Flattened = Matrix.embedInVector(Builder);
        LLVM_DEBUG({
          if (auto *UserInst = dyn_cast<Instruction>(U.getUser()))
            dbgs() << "flattening a " << Matrix.shape() << " matrix:\n"
                   << *Inst
                   << "\nbecause we do not have a shape-aware lowering for its "
                      "user:\n"
                   << *UserInst << '\n';
        });
        ++FlattenedMatrices;
      }
      U.set(Flattened);
    }
  }


  /// Special case for MatMul lowering. Prevents scalar loads of row-major

  /// vectors. Lowers to a vector reduction add instead of sequential adds if

  /// reassociation is enabled.

  void lowerDotProduct(CallInst *MatMul,

                       SmallPtrSet<Instruction *, 16> &FusedInsts,

                       FastMathFlags FMF) {

    if (FusedInsts.contains(MatMul) ||

        MatrixLayout != MatrixLayoutTy::ColumnMajor)

      return;

    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));

    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));


    if (LShape.NumRows != 1 || RShape.NumColumns != 1) // not a dot product

      return;


    Value *LHS = MatMul->getArgOperand(0);

    Value *RHS = MatMul->getArgOperand(1);


    Type *ElementType = cast<FixedVectorType>(LHS->getType())->getElementType();

    bool IsIntVec = ElementType->isIntegerTy();


    // Floating point reductions require reassocation.

    if (!IsIntVec && !FMF.allowReassoc())

      return;


    auto CanBeFlattened = [](Value *Op) {

      if (match(Op, m_BinOp()))

        return true;

      return match(

          Op, m_OneUse(m_CombineOr(

                  m_Load(m_Value()),

                  m_CombineOr(m_Intrinsic<Intrinsic::matrix_transpose>(),

                              m_Intrinsic<Intrinsic::matrix_column_major_load>(

                                  m_Value(), m_SpecificInt(1))))));

    };

    // Returns the cost benefit of using \p Op with the dot product lowering. If

    // the returned cost is < 0, the argument is cheaper to use in the

    // dot-product lowering.

    auto GetCostForArg = [this, &CanBeFlattened](Value *Op, unsigned N) {

      if (!ShapeMap.contains(Op))

        return InstructionCost::getInvalid();


      if (!isa<Instruction>(Op))

        return InstructionCost(0);


      FixedVectorType *VecTy = cast<FixedVectorType>(Op->getType());

      Type *EltTy = VecTy->getElementType();


      if (!CanBeFlattened(Op)) {

        InstructionCost EmbedCost(0);

        // Roughly estimate the cost for embedding the columns into a vector.

        for (unsigned I = 1; I < N; ++I)

          EmbedCost += TTI.getShuffleCost(

              TTI::SK_Splice, FixedVectorType::get(EltTy, 1),

              FixedVectorType::get(EltTy, 1), {}, TTI::TCK_RecipThroughput);

        return EmbedCost;

      }


      if (match(Op, m_BinOp()) && ShapeMap.contains(Op)) {

        InstructionCost OriginalCost =

            TTI.getArithmeticInstrCost(cast<Instruction>(Op)->getOpcode(),

                                       EltTy) *

            N;

        InstructionCost NewCost = TTI.getArithmeticInstrCost(

            cast<Instruction>(Op)->getOpcode(), VecTy);

        return NewCost - OriginalCost;

      }


      if (match(Op, m_Intrinsic<Intrinsic::matrix_transpose>())) {

        // The transpose can be skipped for the dot product lowering, roughly

        // estimate the savings as the cost of embedding the columns in a

        // vector.

        InstructionCost EmbedCost(0);

        for (unsigned I = 1; I < N; ++I)

          EmbedCost -= TTI.getShuffleCost(

              TTI::SK_Splice, FixedVectorType::get(EltTy, 1),

              FixedVectorType::get(EltTy, 1), {}, TTI::TCK_RecipThroughput);

        return EmbedCost;

      }


      // Costs for loads.

      if (N == 1)

        return InstructionCost(0);


      return TTI.getMemoryOpCost(Instruction::Load, VecTy, Align(1), 0) -

             N * TTI.getMemoryOpCost(Instruction::Load, EltTy, Align(1), 0);

    };


    // Iterate over LHS and operations feeding LHS and check if it is profitable

    // to flatten the visited ops.  For each op, we compute the difference

    // between the flattened and matrix versions.

    SmallPtrSet<Value *, 4> Seen;

    SmallVector<Value *> WorkList;

    SmallVector<Value *> ToFlatten;

    WorkList.push_back(LHS);

    InstructionCost LHSCost(0);

    while (!WorkList.empty()) {

      Value *Op = WorkList.pop_back_val();

      if (!Seen.insert(Op).second)

        continue;


      InstructionCost OpCost = GetCostForArg(Op, LShape.NumColumns);

      if (OpCost + LHSCost >= LHSCost)

        continue;


      LHSCost += OpCost;

      ToFlatten.push_back(Op);

      if (auto *I = dyn_cast<Instruction>(Op))

        WorkList.append(I->op_begin(), I->op_end());

    }


    // We compare the costs of a vector.reduce.add to sequential add.

    int AddOpCode = IsIntVec ? Instruction::Add : Instruction::FAdd;

    int MulOpCode = IsIntVec ? Instruction::Mul : Instruction::FMul;

    InstructionCost ReductionCost =

        TTI.getArithmeticReductionCost(

            AddOpCode, cast<FixedVectorType>(LHS->getType()),

            IsIntVec ? std::nullopt : std::optional(FMF)) +

        TTI.getArithmeticInstrCost(MulOpCode, LHS->getType());

    InstructionCost SequentialAddCost =

        TTI.getArithmeticInstrCost(AddOpCode, ElementType) *

            (LShape.NumColumns - 1) +

        TTI.getArithmeticInstrCost(MulOpCode, ElementType) *

            (LShape.NumColumns);

    if ((LHSCost + ReductionCost - SequentialAddCost) > InstructionCost(0))

      return;


    FusedInsts.insert(MatMul);

    IRBuilder<> Builder(MatMul);

    auto FlattenArg = [&Builder, &FusedInsts, &CanBeFlattened,

                       this](Value *Op) {

      // Matmul must be the only user of loads because we don't use LowerLoad

      // for row vectors (LowerLoad results in scalar loads and shufflevectors

      // instead of single vector load).

      if (!CanBeFlattened(Op))

        return;


      if (match(Op, m_BinOp())) {

        auto It = ShapeMap.find(Op);

        if (It != ShapeMap.end()) {

          It->second = It->second.t();

          return;

        }

      }


      FusedInsts.insert(cast<Instruction>(Op));

      // If vector uses the builtin load, lower to a LoadInst

      Value *Arg;

      if (match(Op, m_Intrinsic<Intrinsic::matrix_column_major_load>(

                        m_Value(Arg)))) {

        auto *NewLoad = Builder.CreateLoad(Op->getType(), Arg);

        Op->replaceAllUsesWith(NewLoad);

        eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(Op));

        return;

      } else if (match(Op, m_Intrinsic<Intrinsic::matrix_transpose>(

                               m_Value(Arg)))) {

        ToRemove.push_back(cast<Instruction>(Op));

        Op->replaceAllUsesWith(Arg);

        return;

      }

    };


    for (auto *V : ToFlatten)

      FlattenArg(V);


    LHS = MatMul->getArgOperand(0);


    // Insert mul/fmul and llvm.vector.reduce.fadd

    Value *Mul =

        IsIntVec ? Builder.CreateMul(LHS, RHS) : Builder.CreateFMul(LHS, RHS);


    Value *Result;

    if (IsIntVec)

      Result = Builder.CreateAddReduce(Mul);

    else {

      Result = Builder.CreateFAddReduce(

          ConstantFP::get(

              cast<FixedVectorType>(LHS->getType())->getElementType(), 0.0),

          Mul);

      cast<Instruction>(Result)->setFastMathFlags(FMF);

    }


    // pack scalar back into a matrix and then replace matmul inst

    Result = Builder.CreateInsertElement(PoisonValue::get(MatMul->getType()),

                                         Result, uint64_t(0));

    MatMul->replaceAllUsesWith(Result);

    FusedInsts.insert(MatMul);

    ToRemove.push_back(MatMul);

  }


  /// Emit the code computing \p Result += \p A * \p B, accumulating with
  /// left-associating additions.
  ///
  /// A transpose can be folded into whichever operand the scalars are
  /// extracted from: the first operand for row-major layouts and the second
  /// operand for column-major layouts.  If \p IsScalarMatrixTransposed is set,
  /// that operand is assumed to be transposed.
  ///
  /// In the column-major path, \p IsTiled makes the accumulation start from the
  /// current contents of \p Result; the row-major path always starts from an
  /// empty accumulator.  \p FMF is installed on \p Builder and its contract
  /// flag is forwarded to createMulAdd.
  void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
                          const MatrixTy &B, IRBuilder<> &Builder, bool IsTiled,
                          bool IsScalarMatrixTransposed, FastMathFlags FMF) {
    Type *ResultEltTy = Result.getElementType();

    // Number of result elements that fit into one fixed-width vector register,
    // clamped to at least one.
    const auto RegisterBits =
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
            .getFixedValue();
    const auto ElementBits =
        ResultEltTy->getPrimitiveSizeInBits().getFixedValue();
    const unsigned VF = std::max<unsigned>(RegisterBits / ElementBits, 1U);

    const unsigned NumRows = Result.getNumRows();
    const unsigned NumCols = Result.getNumColumns();
    const unsigned InnerDim = A.getNumColumns();

    const bool IsFP = ResultEltTy->isFloatingPointTy();
    const bool AllowContract = FMF.allowContract();
    assert(A.isColumnMajor() == B.isColumnMajor() &&
           Result.isColumnMajor() == A.isColumnMajor() &&
           "operands must agree on matrix layout");
    unsigned NumComputeOps = 0;

    Builder.setFastMathFlags(FMF);

    if (A.isColumnMajor()) {
      // Column-major: scale column blocks of A by scalars taken from B and
      // accumulate them while walking the K axis.  This keeps the adds
      // vectorizable without reassociation.
      for (unsigned Col = 0; Col != NumCols; ++Col) {
        unsigned BlockSize = VF;
        // A zero result column needs no accumulation in the K == 0 step.
        const bool StartsAtZero =
            isa<ConstantAggregateZero>(Result.getColumn(Col));

        for (unsigned Row = 0; Row < NumRows; Row += BlockSize) {
          // Halve the block size until the block fits the remaining rows.
          while (Row + BlockSize > NumRows)
            BlockSize /= 2;

          Value *Acc = nullptr;
          if (IsTiled)
            Acc = Result.extractVector(Row, Col, BlockSize, Builder);

          for (unsigned K = 0; K != InnerDim; ++K) {
            Value *LHSBlock = A.extractVector(Row, K, BlockSize, Builder);
            // Swap the scalar's coordinates when B is stored transposed.
            const unsigned ScalarCol = IsScalarMatrixTransposed ? K : Col;
            const unsigned ScalarIdx = IsScalarMatrixTransposed ? Col : K;
            Value *Scalar = Builder.CreateExtractElement(
                B.getColumn(ScalarCol), ScalarIdx);
            Value *Splat =
                Builder.CreateVectorSplat(BlockSize, Scalar, "splat");
            Value *Addend = (StartsAtZero && K == 0) ? nullptr : Acc;
            Acc = createMulAdd(Addend, LHSBlock, Splat, IsFP, Builder,
                               AllowContract, NumComputeOps);
          }
          Result.setVector(
              Col, insertVector(Result.getVector(Col), Row, Acc, Builder));
        }
      }
    } else {
      // Row-major: scale row blocks of B by scalars taken from A and
      // accumulate them while walking the K axis.  This keeps the adds
      // vectorizable without reassociation.
      for (unsigned Row = 0; Row != NumRows; ++Row) {
        unsigned BlockSize = VF;
        const bool StartsAtZero =
            isa<ConstantAggregateZero>(Result.getRow(Row));

        for (unsigned Col = 0; Col < NumCols; Col += BlockSize) {
          // Halve the block size until the block fits the remaining columns.
          while (Col + BlockSize > NumCols)
            BlockSize /= 2;

          Value *Acc = nullptr;
          for (unsigned K = 0; K != InnerDim; ++K) {
            Value *RHSBlock = B.extractVector(K, Col, BlockSize, Builder);
            // Swap the scalar's coordinates when A is stored transposed.
            const unsigned ScalarRow = IsScalarMatrixTransposed ? K : Row;
            const unsigned ScalarIdx = IsScalarMatrixTransposed ? Row : K;
            Value *Scalar = Builder.CreateExtractElement(
                A.getVector(ScalarRow), ScalarIdx);
            Value *Splat =
                Builder.CreateVectorSplat(BlockSize, Scalar, "splat");
            Value *Addend = (StartsAtZero && K == 0) ? nullptr : Acc;
            Acc = createMulAdd(Addend, Splat, RHSBlock, IsFP, Builder,
                               AllowContract, NumComputeOps);
          }
          Result.setVector(
              Row, insertVector(Result.getVector(Row), Col, Acc, Builder));
        }
      }
    }
    Result.addNumComputeOps(NumComputeOps);
  }


  /// Return a pointer through which the memory read by \p Load can be accessed
  /// without aliasing the memory written by \p Store.
  ///
  /// If alias analysis proves the two locations disjoint, the pointer operand
  /// of \p Load is returned unchanged.  Otherwise run-time overlap checks are
  /// emitted in front of \p MatMul; if the locations overlap, the loaded memory
  /// is copied to a new alloca.  The returned value is then a PHI selecting
  /// between the original pointer and the copy.
  Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
                               CallInst *MatMul) {
    const MemoryLocation StoreLoc = MemoryLocation::get(Store);
    const MemoryLocation LoadLoc = MemoryLocation::get(Load);
    Value *OrigPtr = Load->getPointerOperand();

    // Nothing to do when noalias can be established statically.
    if (AA->isNoAlias(LoadLoc, StoreLoc))
      return OrigPtr;

    // Otherwise emit code that tests whether the two memory ranges overlap
    // and, if so, copies the loaded memory into a new buffer.

    // Start by creating the blocks for the 2nd half of the check and for the
    // copy.
    BasicBlock *Check0 = MatMul->getParent();
    // FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a
    // DT. Manually collect dominator tree updates, to avoid unnecessary work,
    // as we adjust Check0 and Check1's branches.
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    for (BasicBlock *Succ : successors(Check0))
      DTUpdates.push_back({DT->Delete, Check0, Succ});

    // Each split re-queries MatMul's parent, because the previous split moved
    // MatMul into the newly created block.
    auto SplitBeforeMatMul = [this, MatMul](const char *Name) {
      return SplitBlock(MatMul->getParent(), MatMul,
                        static_cast<DomTreeUpdater *>(nullptr), LI, nullptr,
                        Name);
    };
    BasicBlock *Check1 = SplitBeforeMatMul("alias_cont");
    BasicBlock *Copy = SplitBeforeMatMul("copy");
    BasicBlock *Fusion = SplitBeforeMatMul("no_alias");

    // First check: does the load start before the store ends?  If not, the
    // ranges cannot overlap and we can branch straight to the fused code.
    IRBuilder<> Builder(MatMul);
    Check0->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(Check0);
    Type *IntPtrTy = Builder.getIntPtrTy(Load->getDataLayout());
    Value *StoreBegin = Builder.CreatePtrToInt(
        const_cast<Value *>(StoreLoc.Ptr), IntPtrTy, "store.begin");
    Value *StoreSize = ConstantInt::get(IntPtrTy, StoreLoc.Size.getValue());
    Value *StoreEnd =
        Builder.CreateAdd(StoreBegin, StoreSize, "store.end", true, true);
    Value *LoadBegin = Builder.CreatePtrToInt(const_cast<Value *>(LoadLoc.Ptr),
                                              IntPtrTy, "load.begin");
    Value *LoadStartsBeforeStoreEnd =
        Builder.CreateICmpULT(LoadBegin, StoreEnd);
    Builder.CreateCondBr(LoadStartsBeforeStoreEnd, Check1, Fusion);

    // Second check: does the store start before the load ends?  If so, the
    // ranges alias and the copy is needed; otherwise they do not overlap.
    Check1->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(Check1, Check1->begin());
    Value *LoadSize = ConstantInt::get(IntPtrTy, LoadLoc.Size.getValue());
    Value *LoadEnd =
        Builder.CreateAdd(LoadBegin, LoadSize, "load.end", true, true);
    Value *StoreStartsBeforeLoadEnd =
        Builder.CreateICmpULT(StoreBegin, LoadEnd);
    Builder.CreateCondBr(StoreStartsBeforeLoadEnd, Copy, Fusion);

    // Copy block: duplicate the loaded memory into a new alloca.
    Builder.SetInsertPoint(Copy, Copy->begin());
    auto *LoadVecTy = cast<FixedVectorType>(Load->getType());
    // An array type is used for the alloca, to avoid the potentially huge
    // alignment requirements of large vector types.
    auto *BufferTy =
        ArrayType::get(LoadVecTy->getElementType(), LoadVecTy->getNumElements());
    AllocaInst *Buffer =
        Builder.CreateAlloca(BufferTy, Load->getPointerAddressSpace());
    Builder.CreateMemCpy(Buffer, Buffer->getAlign(), OrigPtr, Load->getAlign(),
                         LoadLoc.Size.getValue());

    // Merge the original pointer and the copy at the top of the fused block.
    Builder.SetInsertPoint(Fusion, Fusion->begin());
    PHINode *Merged = Builder.CreatePHI(Load->getPointerOperandType(), 3);
    Merged->addIncoming(OrigPtr, Check0);
    Merged->addIncoming(OrigPtr, Check1);
    Merged->addIncoming(Buffer, Copy);

    // Bring the dominator tree in line with the new edges.
    DTUpdates.append({{DT->Insert, Check0, Check1},
                      {DT->Insert, Check0, Fusion},
                      {DT->Insert, Check1, Copy},
                      {DT->Insert, Check1, Fusion}});
    DT->applyUpdates(DTUpdates);
    return Merged;
  }


  /// Estimate whether tiling/fusing the multiply \p MatMul is worthwhile.
  ///
  /// Always returns true if ForceFusion is set.  Otherwise fusion is rejected
  /// when the result is a single column of at most VF rows, and accepted only
  /// if the estimated number of vector registers needed for both operands
  /// exceeds the number of vector registers reported by TTI.
  bool isFusionProfitable(CallInst *MatMul) {
    if (ForceFusion)
      return true;

    const ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    const ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
    const unsigned NumRows = LShape.NumRows;
    const unsigned NumCols = RShape.NumColumns;
    const unsigned InnerDim = LShape.NumColumns;

    // Elements per fixed-width vector register, clamped to at least one.
    Type *EltTy = cast<FixedVectorType>(MatMul->getType())->getElementType();
    const auto RegisterBits =
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
            .getFixedValue();
    const auto ElementBits = EltTy->getPrimitiveSizeInBits().getFixedValue();
    const unsigned VF = std::max<unsigned>(RegisterBits / ElementBits, 1U);

    // Cost model for tiling.
    //
    // Tiling only pays off if there is reuse along the row or the column
    // axis.  Vectorization happens along the rows, so a single result column
    // that fits into one vector offers no reuse.
    // TODO: Also consider cost of copying if operands alias.
    const bool SingleVectorResult = NumRows <= VF && NumCols == 1;
    if (SingleVectorResult)
      return false;

    // Beyond that, the operands must need more vector registers than are
    // available.  This is an oversimplification: fusing also introduces extra
    // loads, which may exceed the number of reloads that would be needed.
    auto RegistersFor = [VF](unsigned VectorLength, unsigned NumVectors) {
      return (VectorLength + VF - 1) / VF * NumVectors;
    };
    const unsigned Op0Regs = RegistersFor(NumRows, InnerDim);
    const unsigned Op1Regs = RegistersFor(InnerDim, NumCols);
    const unsigned AvailableRegs =
        TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true));
    return Op0Regs + Op1Regs > AvailableRegs;
  }


  /// Build a matrix made of \p C vectors, each a zero-initialized constant
  /// vector of \p R elements of type \p EltType.
  MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
    auto *ZeroVector =
        ConstantAggregateZero::get(FixedVectorType::get(EltType, R));
    MatrixTy Zero;
    for (unsigned Remaining = C; Remaining != 0; --Remaining)
      Zero.addVector(ZeroVector);
    return Zero;
  }


  /// Replace \p MatMul by a loop nest that multiplies TileSize x TileSize
  /// tiles loaded from \p LPtr and \p RPtr (with shapes \p LShape and
  /// \p RShape) and stores every finished result tile through the pointer
  /// operand of \p Store.
  void createTiledLoops(CallInst *MatMul, Value *LPtr, ShapeInfo LShape,
                        Value *RPtr, ShapeInfo RShape, StoreInst *Store) {
    Type *EltTy = cast<FixedVectorType>(MatMul->getType())->getElementType();

    // Set up the main tiling loop nest between the code before MatMul and the
    // continuation block that starts at MatMul.
    TileInfo TI(LShape.NumRows, RShape.NumColumns, LShape.NumColumns, TileSize);
    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
    Instruction *SplitPt = MatMul;
    BasicBlock *Start = SplitPt->getParent();
    BasicBlock *End =
        SplitBlock(SplitPt->getParent(), SplitPt, DT, LI, nullptr, "continue");
    IRBuilder<> Builder(MatMul);
    BasicBlock *InnerBody = TI.CreateTiledLoops(Start, End, Builder, DTU, *LI);

    Type *TileVecTy =
        FixedVectorType::get(MatMul->getType()->getScalarType(), TileSize);

    // In the header of the innermost loop, create one PHI per result column;
    // they carry the accumulated tile across iterations and start at zero.
    Builder.SetInsertPoint(TI.KLoop.Header->getTerminator());
    auto *ZeroColumn = ConstantAggregateZero::get(TileVecTy);
    BasicBlock *AccumulatorEntry = TI.RowLoop.Header->getSingleSuccessor();
    MatrixTy TileResult;
    SmallVector<PHINode *, 4> ColumnPhis;
    for (unsigned Col = 0; Col != TileSize; ++Col) {
      PHINode *Accumulator =
          Builder.CreatePHI(TileVecTy, 2, "result.vec." + Twine(Col));
      Accumulator->addIncoming(ZeroColumn, AccumulatorEntry);
      TileResult.addVector(Accumulator);
      ColumnPhis.push_back(Accumulator);
    }

    // The innermost loop body computes
    //   Res += Load(CurrentRow, K) * Load(K, CurrentColumn)
    Builder.SetInsertPoint(InnerBody->getTerminator());
    // Load one tile of each operand.
    const ShapeInfo TileShape(TileSize, TileSize);
    MatrixTy LHSTile =
        loadMatrix(LPtr, {}, false, LShape, TI.RowLoop.Index, TI.KLoop.Index,
                   TileShape, EltTy, Builder);
    MatrixTy RHSTile =
        loadMatrix(RPtr, {}, false, RShape, TI.KLoop.Index, TI.ColumnLoop.Index,
                   TileShape, EltTy, Builder);
    emitMatrixMultiply(TileResult, LHSTile, RHSTile, Builder, true, false,
                       getFastMathFlags(MatMul));

    // Once the innermost loop has finished, write the result tile back.
    Builder.SetInsertPoint(TI.RowLoop.Latch->getTerminator());
    storeMatrix(TileResult, Store->getPointerOperand(), Store->getAlign(),
                Store->isVolatile(), {LShape.NumRows, RShape.NumColumns},
                TI.RowLoop.Index, TI.ColumnLoop.Index, EltTy, Builder);

    // Close the accumulation cycle through the latch of the innermost loop.
    for (unsigned Col = 0, NumCols = TileResult.getNumVectors(); Col != NumCols;
         ++Col)
      ColumnPhis[Col]->addIncoming(TileResult.getVector(Col), TI.KLoop.Latch);

    // Request unrolling of a few iterations of the innermost loop, so that
    // each iteration has enough work.
    // FIXME: The unroller should make this decision directly instead, but
    // currently the cost-model is not up to the task.
    unsigned InnerLoopUnrollCount = std::min(10u, LShape.NumColumns / TileSize);
    addStringMetadataToLoop(LI->getLoopFor(TI.KLoop.Header),
                            "llvm.loop.unroll.count", InnerLoopUnrollCount);
  }


  /// Lower the chain {\p LoadOp0, \p LoadOp1} -> \p MatMul -> \p Store tile by
  /// tile, loading operand tiles straight from memory and storing result
  /// tiles straight to the store's destination.
  ///
  /// Does nothing unless isFusionProfitable() accepts \p MatMul.  Operand
  /// pointers are first passed through getNonAliasingPointer().  If
  /// TileUseLoops is set and both result dimensions are multiples of TileSize,
  /// a loop nest is generated via createTiledLoops(); otherwise the tiles are
  /// emitted fully unrolled in front of \p Store.  Afterwards \p Store and
  /// \p MatMul (and the loads, if they became unused) are recorded in
  /// \p FusedInsts and erased.
  void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
                      StoreInst *Store,
                      SmallPtrSetImpl<Instruction *> &FusedInsts) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Tiling only supported for column-major matrixes at the moment!");
    if (!isFusionProfitable(MatMul))
      return;

    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));

    // The multiply computes an R x C result from an R x M and an M x C operand.
    const unsigned R = LShape.NumRows;
    const unsigned C = RShape.NumColumns;
    const unsigned M = LShape.NumColumns;
    auto *EltType = cast<FixedVectorType>(MatMul->getType())->getElementType();

    Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
    Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
    Value *CPtr = Store->getPointerOperand();

    if (TileUseLoops && (R % TileSize == 0 && C % TileSize == 0))
      createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store);
    else {
      IRBuilder<> Builder(Store);
      for (unsigned J = 0; J < C; J += TileSize)
        for (unsigned I = 0; I < R; I += TileSize) {
          // Tiles at the right/bottom edge may be smaller than TileSize.
          const unsigned TileR = std::min(R - I, unsigned(TileSize));
          const unsigned TileC = std::min(C - J, unsigned(TileSize));
          MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);

          // Accumulate the products of all operand tiles along the K axis.
          for (unsigned K = 0; K < M; K += TileSize) {
            const unsigned TileM = std::min(M - K, unsigned(TileSize));
            MatrixTy A =
                loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(),
                           LShape, getIndex(APtr, I), getIndex(APtr, K),
                           {TileR, TileM}, EltType, Builder);
            MatrixTy B =
                loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
                           RShape, getIndex(BPtr, K), getIndex(BPtr, J),
                           {TileM, TileC}, EltType, Builder);
            emitMatrixMultiply(Res, A, B, Builder, true, false,
                               getFastMathFlags(MatMul));
          }
          // The result matrix is R x C, so describe the destination with that
          // shape (matching what createTiledLoops passes for its store).
          storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, C},
                      getIndex(CPtr, I), getIndex(CPtr, J), EltType, Builder);
        }
    }

    // Mark eliminated instructions as fused and remove them.
    FusedInsts.insert(Store);
    FusedInsts.insert(MatMul);
    eraseFromParentAndRemoveFromShapeMap(Store);
    eraseFromParentAndRemoveFromShapeMap(MatMul);
    if (LoadOp0->use_empty()) {
      FusedInsts.insert(LoadOp0);
      eraseFromParentAndRemoveFromShapeMap(LoadOp0);
    }
    // Both operands may be the same load; make sure it is erased only once.
    if (LoadOp1 != LoadOp0 && LoadOp1->use_empty()) {
      FusedInsts.insert(LoadOp1);
      eraseFromParentAndRemoveFromShapeMap(LoadOp1);
    }
  }


  /// Try to lower matrix multiply chains by fusing operations.
  ///
  /// Two patterns are handled:
  ///  * a multiply whose scalar-providing operand is a matrix transpose; the
  ///    transpose is folded into the multiply, and
  ///  * for column-major layout, a {load, load} -> multiply -> store chain,
  ///    which is handed to emitSIMDTiling().
  ///
  /// Call finalizeLowering on lowered instructions.  Instructions that are
  /// completely eliminated by fusion are added to \p FusedInsts.
  /// \p LifetimeEnds holds the lifetime.end markers to check against the fused
  /// loads; markers removed here are also dropped from that list.
  void
  LowerMatrixMultiplyFused(CallInst *MatMul,
                           SmallPtrSetImpl<Instruction *> &FusedInsts,
                           SmallVector<IntrinsicInst *, 16> &LifetimeEnds) {
    // Nothing to do if FuseMatrix is unset or no dominator tree is available.
    if (!FuseMatrix || !DT)
      return;

    assert(AA && LI && "Analyses should be available");

    Value *A = MatMul->getArgOperand(0);
    Value *B = MatMul->getArgOperand(1);

    // We can fold the transpose into the operand that is used to fetch scalars.
    // That is the second operand for column-major and the first operand for
    // row-major layout.
    Value *T;
    if (MatrixLayout == MatrixLayoutTy::ColumnMajor
            ? match(B, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(T)))
            : match(A, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(T)))) {
      IRBuilder<> Builder(MatMul);
      auto *EltType =
          cast<FixedVectorType>(MatMul->getType())->getElementType();
      ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
      ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
      const unsigned R = LShape.NumRows;
      const unsigned M = LShape.NumColumns;
      const unsigned C = RShape.NumColumns;

      MatrixTy MA;
      MatrixTy MB;

      // Use the transpose's input T in place of the transposed operand and
      // remember which operand was the transpose.
      Value *Transpose;
      if (MatrixLayout == MatrixLayoutTy::ColumnMajor) {
        MA = getMatrix(A, ShapeInfo(R, M), Builder);
        MB = getMatrix(T, ShapeInfo(C, M), Builder);
        Transpose = B;
      } else {
        MA = getMatrix(T, ShapeInfo(R, M), Builder);
        MB = getMatrix(B, ShapeInfo(C, M), Builder);
        Transpose = A;
      }

      // Initialize the output
      MatrixTy Result(R, C, EltType);

      // IsScalarMatrixTransposed is set, so scalars are read from the
      // untransposed input with swapped indices.
      emitMatrixMultiply(Result, MA, MB, Builder, false, true,
                         getFastMathFlags(MatMul));

      FusedInsts.insert(MatMul);
      // The transpose is dead once its only user, the multiply, is lowered.
      if (Transpose->hasOneUse()) {
        FusedInsts.insert(cast<Instruction>(Transpose));
        ToRemove.push_back(cast<Instruction>(Transpose));
        // TODO: add a fake entry for the folded instruction so that this is
        // included in the expression in the remark.
        Inst2ColumnMatrix[Transpose] = MatrixTy(M, C, EltType);
      }
      finalizeLowering(MatMul, Result, Builder);
      return;
    }

    // The load/store fusion below requires a single user of the multiply and
    // column-major layout.
    if (!MatMul->hasOneUse() || MatrixLayout != MatrixLayoutTy::ColumnMajor)
      return;

    // Lower {ld, ld} -> matmul -> st chains.  No need to call finalizeLowering
    // since the single store user will be lowered as part of this.
    auto *LoadOp0 = dyn_cast<LoadInst>(A);
    auto *LoadOp1 = dyn_cast<LoadInst>(B);
    auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin());
    if (LoadOp0 && LoadOp1 && Store) {
      // The store address must dominate the MatMul instruction, otherwise
      // we create invalid IR.
      // Walk the address computation transitively and collect every
      // instruction that does not dominate MatMul yet, so it can be hoisted.
      SetVector<Value *> WorkList;
      WorkList.insert(Store->getOperand(1));
      SmallVector<Instruction *> ToHoist;
      for (unsigned I = 0; I != WorkList.size(); ++I) {
        Value *Current = WorkList[I];
        auto *CurrI = dyn_cast<Instruction>(Current);
        // Non-instruction values need no hoisting.
        if (!CurrI)
          continue;
        // Give up on PHIs in the address computation.
        if (isa<PHINode>(CurrI))
          return;
        if (DT->dominates(CurrI, MatMul))
          continue;
        // Give up if the instruction may have side effects or may read from
        // memory.
        if (CurrI->mayHaveSideEffects() || CurrI->mayReadFromMemory())
          return;
        ToHoist.push_back(CurrI);
        WorkList.insert_range(CurrI->operands());
      }

      // Hoist in dominance order, so definitions are moved before their users.
      sort(ToHoist, [this](Instruction *A, Instruction *B) {
        return DT->dominates(A, B);
      });
      for (Instruction *I : ToHoist)
        I->moveBefore(MatMul->getIterator());

      // Deal with lifetime.end calls that might be between Load0/Load1 and the
      // store. To avoid introducing loads to dead objects (i.e. after the
      // lifetime has been terminated by @llvm.lifetime.end), either sink them
      // after the store if in the same block, or remove the lifetime.end marker
      // otherwise. This might pessimize further optimizations, by extending the
      // lifetime of the object until the function returns, but should be
      // conservatively correct.
      MemoryLocation Load0Loc = MemoryLocation::get(LoadOp0);
      MemoryLocation Load1Loc = MemoryLocation::get(LoadOp1);
      BasicBlock *StoreParent = Store->getParent();
      bool FusableOpsInSameBlock = LoadOp0->getParent() == StoreParent &&
                                   LoadOp1->getParent() == StoreParent;
      for (unsigned Idx = 0; Idx != LifetimeEnds.size();) {
        IntrinsicInst *End = LifetimeEnds[Idx];
        // Advance to the next marker on every exit from this iteration, unless
        // released below because the current slot was refilled.
        auto Inc = make_scope_exit([&Idx]() { Idx++; });
        // If the lifetime.end is guaranteed to be before the loads or after the
        // store, it won't interfere with fusion.
        if (DT->dominates(End, LoadOp0) && DT->dominates(End, LoadOp1))
          continue;
        if (DT->dominates(Store, End))
          continue;
        // If all fusable ops are in the same block and the lifetime.end is in a
        // different block, it won't interfere with fusion.
        if (FusableOpsInSameBlock && End->getParent() != StoreParent)
          continue;

        // If the loads don't alias the lifetime.end, it won't interfere with
        // fusion.
        MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 0, nullptr);
        if (!EndLoc.Ptr)
          continue;
        if (AA->isNoAlias(Load0Loc, EndLoc) && AA->isNoAlias(Load1Loc, EndLoc))
          continue;

        // If both lifetime.end and the store are in the same block, extend the
        // lifetime until after the store, so the new lifetime covers the loads
        // we introduce later.
        if (End->getParent() == StoreParent) {
          End->moveAfter(Store);
          continue;
        }

        // Otherwise remove the conflicting lifetime.end marker.
        // The last list entry is swapped into the current slot, so Idx must
        // not advance: that entry still has to be examined.
        ToRemove.push_back(End);
        std::swap(LifetimeEnds[Idx], LifetimeEnds.back());
        LifetimeEnds.pop_back();
        Inc.release();
      }

      emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
      return;
    }
  }


  /// Lower a call to llvm.matrix.multiply without any fusion.
  ///
  /// The operand shapes are read from arguments 2-4 of \p MatMul; the returned
  /// matrix holds the NumRows(LHS) x NumColumns(RHS) product.
  MatrixTy LowerMultiply(CallInst *MatMul, IRBuilder<> &Builder) {
    Type *ResultEltTy =
        cast<FixedVectorType>(MatMul->getType())->getElementType();
    Value *InnerDim = MatMul->getArgOperand(3);
    const ShapeInfo LShape(MatMul->getArgOperand(2), InnerDim);
    const ShapeInfo RShape(InnerDim, MatMul->getArgOperand(4));

    const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder);
    const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder);
    assert(Lhs.getElementType() == Rhs.getElementType() &&
           "Matrix multiply argument element types do not match.");
    assert(LShape.NumColumns == RShape.NumRows);

    // Set up the output matrix.
    MatrixTy Product(LShape.NumRows, RShape.NumColumns, ResultEltTy);
    assert(Lhs.getElementType() == Product.getElementType() &&
           "Matrix multiply result element type does not match arguments.");

    emitMatrixMultiply(Product, Lhs, Rhs, Builder, /*IsTiled=*/false,
                       /*IsScalarMatrixTransposed=*/false,
                       getFastMathFlags(MatMul));
    return Product;
  }


  /// Lower a call to llvm.matrix.transpose.
  ///
  /// Every result vector is assembled element by element: element J of result
  /// vector I is element I of input vector J.
  MatrixTy LowerTranspose(CallInst *Inst, IRBuilder<> &Builder) {
    Value *Input = Inst->getArgOperand(0);
    auto *InputVecTy = cast<FixedVectorType>(Input->getType());
    ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
    MatrixTy InputMatrix = getMatrix(Input, ArgShape, Builder);

    // Transposing swaps the number of vectors and the vector length.
    unsigned NewNumVecs = ArgShape.NumColumns;
    unsigned NewNumElts = ArgShape.NumRows;
    if (InputMatrix.isColumnMajor())
      std::swap(NewNumVecs, NewNumElts);

    auto *ResultVecTy =
        FixedVectorType::get(InputVecTy->getElementType(), NewNumElts);

    MatrixTy Result;
    for (unsigned VecIdx = 0; VecIdx != NewNumVecs; ++VecIdx) {
      // Each result vector starts out as poison and is filled in below.
      Value *Transposed = PoisonValue::get(ResultVecTy);
      // Visit the input vectors and move element VecIdx of each into place.
      uint64_t EltIdx = 0;
      for (Value *InputVec : InputMatrix.vectors()) {
        Value *Elt = Builder.CreateExtractElement(InputVec, VecIdx);
        // Row and column indices are transposed.
        Transposed = Builder.CreateInsertElement(Transposed, Elt, EltIdx);
        ++EltIdx;
      }
      Result.addVector(Transposed);
    }

    // TODO: Improve estimate of operations needed for transposes. Currently we
    // just count the insertelement/extractelement instructions, but do not
    // account for later simplifications/combines.
    return Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns)
        .addNumExposedTransposes(1);
  }


  /// Lower the load \p Inst of a matrix with shape \p SI from \p Ptr, reusing
  /// the load's alignment and volatility and using the shape's stride.
  MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr,
                     IRBuilder<> &Builder) {
    auto *Stride = getIndex(Ptr, SI.getStride());
    return LowerLoad(Inst, Ptr, Inst->getAlign(), Stride, Inst->isVolatile(),
                     SI, Builder);
  }


  /// Lower the store \p Inst of the matrix \p StoredVal with shape \p SI to
  /// \p Ptr, reusing the store's alignment and volatility and using the
  /// shape's stride.
  MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal,
                      Value *Ptr, IRBuilder<> &Builder) {
    auto *Stride = getIndex(Ptr, SI.getStride());
    return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(), Stride,
                      Inst->isVolatile(), SI, Builder);
  }


  /// Lower the PHI \p Inst of a matrix with shape \p SI by wiring the vectors
  /// of every incoming matrix into the per-vector PHIs obtained from
  /// getMatrix() for \p Inst.
  MatrixTy VisitPHI(PHINode *Inst, const ShapeInfo &SI, IRBuilder<> &Builder) {
    const auto AfterPHIs = Inst->getParent()->getFirstInsertionPt();
    Builder.SetInsertPoint(AfterPHIs);
    MatrixTy PhiM = getMatrix(Inst, SI, Builder);
    const unsigned NumVectors = PhiM.getNumVectors();

    for (unsigned Idx = 0, NumIncoming = Inst->getNumIncomingValues();
         Idx != NumIncoming; ++Idx) {
      Value *IncomingV = Inst->getIncomingValue(Idx);
      BasicBlock *IncomingB = Inst->getIncomingBlock(Idx);

      // getMatrix() may need to insert reshaping instructions.  By default
      // they go to the top of this block, after the PHIs; preferably they are
      // placed right after the definition of the incoming value instead.
      Builder.SetInsertPoint(AfterPHIs);
      if (auto *IncomingInst = dyn_cast<Instruction>(IncomingV)) {
        auto MaybeIP = IncomingInst->getInsertionPointAfterDef();
        if (MaybeIP)
          Builder.SetInsertPoint(*MaybeIP);
      }

      MatrixTy IncomingM = getMatrix(IncomingV, SI, Builder);

      for (unsigned Vec = 0; Vec != NumVectors; ++Vec)
        cast<PHINode>(PhiM.getVector(Vec))
            ->addIncoming(IncomingM.getVector(Vec), IncomingB);
    }

    // finalizeLowering() may insert instructions as well; the safe spot for
    // them is right after the initial group of PHIs.
    Builder.SetInsertPoint(AfterPHIs);
    return PhiM;
  }


  /// Lower the binary operator \p Inst on matrices of shape \p SI by applying
  /// its opcode to each pair of corresponding operand vectors.
  MatrixTy VisitBinaryOperator(BinaryOperator *Inst, const ShapeInfo &SI,
                               IRBuilder<> &Builder) {
    MatrixTy Result;
    MatrixTy A = getMatrix(Inst->getOperand(0), SI, Builder);
    MatrixTy B = getMatrix(Inst->getOperand(1), SI, Builder);
    assert(A.isColumnMajor() == B.isColumnMajor() &&
           Result.isColumnMajor() == A.isColumnMajor() &&
           "operands must agree on matrix layout");

    // The emitted vector operations get the fast-math flags of the original.
    Builder.setFastMathFlags(getFastMathFlags(Inst));

    const auto Opcode = Inst->getOpcode();
    for (auto [LhsVec, RhsVec] : llvm::zip_equal(A.vectors(), B.vectors())) {
      Value *VecOp = Builder.CreateBinOp(Opcode, LhsVec, RhsVec);
      Result.addVector(VecOp);
    }

    const unsigned OpsPerVector = getNumOps(Result.getVectorTy());
    return Result.addNumComputeOps(OpsPerVector * Result.getNumVectors());
  }


  /// Lower the unary operator \p Inst on a matrix of shape \p SI by applying it
  /// to each vector of the operand.  Only FNeg is supported.
  MatrixTy VisitUnaryOperator(UnaryOperator *Inst, const ShapeInfo &SI,
                              IRBuilder<> &Builder) {
    MatrixTy Operand = getMatrix(Inst->getOperand(0), SI, Builder);

    // The emitted vector operations get the fast-math flags of the original.
    Builder.setFastMathFlags(getFastMathFlags(Inst));

    MatrixTy Result;
    for (Value *OperandVec : Operand.vectors()) {
      switch (Inst->getOpcode()) {
      case Instruction::FNeg:
        Result.addVector(Builder.CreateFNeg(OperandVec));
        break;
      default:
        llvm_unreachable("Unsupported unary operator for matrix");
      }
    }

    const unsigned OpsPerVector = getNumOps(Result.getVectorTy());
    return Result.addNumComputeOps(OpsPerVector * Result.getNumVectors());
  }


  /// Lower cast instructions.

  MatrixTy VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape,
                                IRBuilder<> &Builder) {
    MatrixTy Result;
    MatrixTy Input = getMatrix(Inst->getOperand(0), Shape, Builder);

    Builder.setFastMathFlags(getFastMathFlags(Inst));

    // Each vector is cast to a vector with the destination element type and
    // as many elements as the stride of the input matrix.
    Type *DestEltTy = cast<VectorType>(Inst->getType())->getElementType();
    auto *DestVecTy =
        VectorType::get(DestEltTy, ElementCount::getFixed(Input.getStride()));

    const auto CastOpcode = Inst->getOpcode();
    for (Value *Vec : Input.vectors()) {
      Value *Converted = Builder.CreateCast(CastOpcode, Vec, DestVecTy);
      Result.addVector(Converted);
    }

    // Record the compute cost: ops per vector times the number of vectors.
    const auto OpsPerVector = getNumOps(Result.getVectorTy());
    return Result.addNumComputeOps(OpsPerVector * Result.getNumVectors());
  }


  /// Lower selects.

  MatrixTy VisitSelectInst(SelectInst *Inst, const ShapeInfo &Shape,
                           IRBuilder<> &Builder) {
    Value *Cond = Inst->getOperand(0);

    MatrixTy Result;
    MatrixTy TrueM = getMatrix(Inst->getOperand(1), Shape, Builder);
    MatrixTy FalseM = getMatrix(Inst->getOperand(2), Shape, Builder);

    // Build one condition value per vector: a non-vector condition is reused
    // for every vector, a vector condition is split like the operands.
    SmallVector<Value *> CondPerVector;
    if (!isa<FixedVectorType>(Cond->getType())) {
      CondPerVector.assign(TrueM.getNumVectors(), Cond);
    } else {
      MatrixTy CondM = getMatrix(Cond, Shape, Builder);
      for (Value *CondVec : CondM.vectors())
        CondPerVector.push_back(CondVec);
    }

    for (auto [CondVec, TrueVec, FalseVec] :
         llvm::zip_equal(CondPerVector, TrueM.vectors(), FalseM.vectors())) {
      Value *Picked = Builder.CreateSelect(CondVec, TrueVec, FalseVec);
      Result.addVector(Picked);
    }

    // Record the compute cost: ops per vector times the number of vectors.
    const auto OpsPerVector = getNumOps(Result.getVectorTy());
    return Result.addNumComputeOps(OpsPerVector * Result.getNumVectors());
  }


  /// Helper to linearize a matrix expression tree into a string. Currently

  /// matrix expressions are linearized by starting at an expression leaf and

  /// linearizing bottom up.

  struct ExprLinearizer {

    unsigned LengthToBreak = 100;

    std::string Str;

    raw_string_ostream Stream;

    unsigned LineLength = 0;

    const DataLayout &DL;


    /// Mapping from instructions to matrixes. It is used to identify

    /// matrix instructions.

    const MapVector<Value *, MatrixTy> &Inst2Matrix;


    /// Mapping from values to the leaves of all expressions that the value is

    /// part of.

    const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;


    /// Set of matrix expressions in the scope of a given DISubprogram.

    const SmallSetVector<Value *, 32> &ExprsInSubprogram;


    /// Leaf node of the expression to linearize.

    Value *Leaf;


    /// Used to keep track of sub-expressions that get reused while linearizing

    /// the expression. Re-used sub-expressions are marked as (reused).

    SmallPtrSet<Value *, 8> ReusedExprs;


    /// Set up a linearizer for the expression rooted at \p Leaf. The output
    /// stream is bound to Str, so everything written through Stream
    /// accumulates in Str. The data layout and the lookup tables are stored
    /// as references, not copied.
    ExprLinearizer(const DataLayout &DL,

                   const MapVector<Value *, MatrixTy> &Inst2Matrix,

                   const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,

                   const SmallSetVector<Value *, 32> &ExprsInSubprogram,

                   Value *Leaf)

        : Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),

          ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}


    /// Emit \p N spaces and account for them in the current line length.
    void indent(unsigned N) {
      Stream.indent(N);
      LineLength += N;
    }


    /// Start a new output line and reset the tracked line length.
    void lineBreak() {
      LineLength = 0;
      Stream << '\n';
    }


    /// Break the line once it has reached LengthToBreak characters, then
    /// indent by \p Indent if the output is at the start of a line.
    void maybeIndent(unsigned Indent) {
      const bool LineIsFull = LineLength >= LengthToBreak;
      if (LineIsFull)
        lineBreak();

      // Evaluated after the potential break above, which resets LineLength.
      const bool AtLineStart = LineLength == 0;
      if (AtLineStart)
        indent(Indent);
    }


    /// Append \p S to the output and add its size to the current line length.
    void write(StringRef S) {
      Stream << S;
      LineLength += S.size();
    }


    /// Follow pointer operands starting at \p V for as long as
    /// getPointerOperand() yields one. If the value reached is a pointer,
    /// return its underlying object; otherwise return the value itself.
    Value *getUnderlyingObjectThroughLoads(Value *V) {
      while (Value *Ptr = getPointerOperand(V))
        V = Ptr;

      if (!V->getType()->isPointerTy())
        return V;
      return getUnderlyingObject(V);
    }


    /// Returns true if \p V is a matrix value in the given subprogram.

    bool isMatrix(Value *V) const {
      // Membership in ExprsInSubprogram is what makes a value a matrix here.
      return ExprsInSubprogram.count(V) != 0;
    }


    /// If \p V is a matrix value, print its shape as NumRows x NumColumns to

    /// \p SS.

    void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {
      auto Entry = Inst2Matrix.find(V);

      // Values without an entry in Inst2Matrix have no known shape.
      if (Entry == Inst2Matrix.end()) {
        SS << "unknown";
        return;
      }

      const MatrixTy &Matrix = Entry->second;
      SS << Matrix.getNumRows() << "x" << Matrix.getNumColumns();
    }


    /// Write the called function name. Handles calls to llvm.matrix.*

    /// specially: we write the name, followed by the dimensions of the input

    /// matrixes, followed by the scalar type name.

    void writeFnName(CallInst *CI) {
      auto *Callee = CI->getCalledFunction();
      if (!Callee) {
        write("<no called fn>");
        return;
      }

      // Anything that is not an llvm.matrix.* call is written by plain name.
      StringRef CalleeName = Callee->getName();
      if (!CalleeName.starts_with("llvm.matrix")) {
        write(CalleeName);
        return;
      }

      // Matrix intrinsic: base name without the "llvm.matrix." prefix ...
      auto *II = cast<IntrinsicInst>(CI);
      const Intrinsic::ID IID = II->getIntrinsicID();
      const size_t PrefixLen = StringRef("llvm.matrix.").size();
      write(Intrinsic::getBaseName(IID).drop_front(PrefixLen));
      write(".");

      // ... followed by the operand shapes and the scalar type name.
      std::string Suffix;
      raw_string_ostream SS(Suffix);

      switch (IID) {
      case Intrinsic::matrix_multiply: {
        prettyPrintMatrixType(II->getOperand(0), SS);
        SS << ".";
        prettyPrintMatrixType(II->getOperand(1), SS);
        SS << "." << *II->getType()->getScalarType();
        break;
      }
      case Intrinsic::matrix_transpose: {
        prettyPrintMatrixType(II->getOperand(0), SS);
        SS << "." << *II->getType()->getScalarType();
        break;
      }
      case Intrinsic::matrix_column_major_load: {
        // For a load, the shape printed is that of the call itself.
        prettyPrintMatrixType(II, SS);
        SS << "." << *II->getType()->getScalarType();
        break;
      }
      case Intrinsic::matrix_column_major_store: {
        // For a store, shape and scalar type come from the stored operand.
        Value *Stored = II->getOperand(0);
        prettyPrintMatrixType(Stored, SS);
        SS << "." << *Stored->getType()->getScalarType();
        break;
      }
      default:
        llvm_unreachable("Unhandled case");
      }

      write(Suffix);
    }


    /// Return the number of trailing arguments of \p CI that linearizeExpr()
    /// leaves out of the printed operand list: 2 for matrix transpose, 3 for
    /// matrix multiply and column-major load/store, and 0 for anything else.
    unsigned getNumShapeArgs(CallInst *CI) const {
      auto *II = dyn_cast<IntrinsicInst>(CI);
      if (!II)
        return 0;

      switch (II->getIntrinsicID()) {
      case Intrinsic::matrix_transpose:
        return 2;
      case Intrinsic::matrix_multiply:
      case Intrinsic::matrix_column_major_load:
      case Intrinsic::matrix_column_major_store:
        return 3;
      default:
        return 0;
      }
    }


    /// Special printing for values: for pointers, we print if they refer to an

    /// (function) external address or a stack address, for other values we

    /// either print the constant or "scalar"/"matrix" for other values.

    void write(Value *V) {
      V = getUnderlyingObjectThroughLoads(V);

      // Pointers: print the kind of address, followed by the name if any.
      if (V->getType()->isPointerTy()) {
        if (isa<AllocaInst>(V))
          write("stack addr");
        else
          write("addr");

        StringRef Name = V->getName();
        if (!Name.empty()) {
          write(" %");
          write(Name);
        }
        return;
      }

      // Non-pointers: constant integers print their value, other constants
      // print "constant", and everything else is "matrix" or "scalar".
      std::string Text;
      raw_string_ostream TextOS(Text);
      if (auto *IntConst = dyn_cast<ConstantInt>(V))
        TextOS << IntConst->getValue();
      else if (isa<Constant>(V))
        TextOS << "constant";
      else
        TextOS << (isMatrix(V) ? "matrix" : "scalar");

      // Surrounding whitespace is stripped before the text is emitted.
      write(StringRef(Text).trim());
    }


    /// Linearize expression \p Expr starting at an indentation of \p Indent.

    /// Expressions that are re-used multiple times are prefixed with (reused)

    /// at the re-used root instruction.

    void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused,
                       bool ParentShared) {
      // \p ParentReused: the parent was already marked "(reused)", so the
      // marker is not repeated for this expression.
      // \p ParentShared: the parent was already reported as shared with other
      // leaves, so the shared check is skipped for this expression.
      //
      // NOTE(review): the recursion below has no cycle check; if the operand
      // graph of matrix expressions can be cyclic (e.g. through PHIs), this
      // would not terminate - confirm against callers.
      auto *I = cast<Instruction>(Expr);
      maybeIndent(Indent);
      SmallVector<Value *, 8> Ops;

      // Is Expr shared with other expression leaves?
      bool ExprShared = false;

      // Deal with shared subtrees. Mark them as shared, if required.
      if (!ParentShared) {
        auto SI = Shared.find(Expr);
        assert(SI != Shared.end() && SI->second.count(Leaf));

        for (Value *S : SI->second) {
          if (S == Leaf)
            continue;
          // Named SharedLoc to avoid shadowing the DataLayout member DL.
          DebugLoc SharedLoc = cast<Instruction>(S)->getDebugLoc();
          write("shared with remark at line " +
                std::to_string(SharedLoc.getLine()) + " column " +
                std::to_string(SharedLoc.getCol()) + " (");
        }
        ExprShared = SI->second.size() > 1;
      }

      // An expression is reused if it was already visited while linearizing
      // the current leaf. Only the root of a reused subtree gets the marker.
      bool Reused = !ReusedExprs.insert(Expr).second;
      if (Reused && !ParentReused)
        write("(reused) ");

      if (auto *CI = dyn_cast<CallInst>(I)) {
        writeFnName(CI);

        // Print all call arguments except the trailing shape arguments.
        Ops.append(CI->arg_begin(), CI->arg_end() - getNumShapeArgs(CI));
      } else if (isa<BitCastInst>(Expr)) {
        // Special case bitcasts, which are used to materialize matrixes from
        // non-matrix ops.
        write("matrix");
        return;
      } else {
        Ops.append(I->value_op_begin(), I->value_op_end());
        write(I->getOpcodeName());
      }

      write("(");

      // Put each operand on its own line once there are more operands than
      // this threshold (2 for column-major loads, 1 otherwise).
      unsigned NumOpsToBreak = 1;
      if (match(Expr, m_Intrinsic<Intrinsic::matrix_column_major_load>()))
        NumOpsToBreak = 2;

      for (unsigned OpIdx = 0, NumOps = Ops.size(); OpIdx != NumOps; ++OpIdx) {
        Value *Op = Ops[OpIdx];
        if (NumOps > NumOpsToBreak)
          lineBreak();

        maybeIndent(Indent + 1);
        if (isMatrix(Op))
          linearizeExpr(Op, Indent + 1, Reused, ExprShared);
        else
          write(Op);

        // Separate operands by position. Comparing the operand value against
        // Ops.back() would drop the separator whenever an earlier operand is
        // the same value as the last one (e.g. "fadd %a, %a").
        if (OpIdx + 1 != NumOps)
          write(", ");
      }

      write(")");
    }


    /// Return the text linearized so far; the reference points at the
    /// member string Str.
    const std::string &getResult() { return Str; }

  };


  /// Generate remarks for matrix operations in a function. To generate remarks

  /// for matrix expressions, the following approach is used:

  /// 1. Use the inlined-at debug information to group matrix operations to the

  ///    DISubprograms they are contained in.

  /// 2. Collect leaves of matrix expressions (done in

  ///    RemarkGenerator::getExpressionLeaves) for each subprogram - expression

  ///    mapping. Leaves are lowered matrix instructions without other matrix

  ///    users (like stores) in the current subprogram.

  /// 3. For each leaf, create a remark containing a linearized version of the

  ///    matrix expression. The expression is linearized by a recursive

  ///    bottom-up traversal of the matrix operands, starting at a leaf. Note

  ///    that multiple leaves can share sub-expressions. Shared subexpressions

  ///    are explicitly marked as shared().

  struct RemarkGenerator {

    const MapVector<Value *, MatrixTy> &Inst2Matrix;

    OptimizationRemarkEmitter &ORE;

    Function &Func;

    const DataLayout &DL;


    /// Set up remark generation for \p Func. \p Inst2Matrix and \p ORE are
    /// stored as references, and the data layout is obtained from \p Func.
    RemarkGenerator(const MapVector<Value *, MatrixTy> &Inst2Matrix,

                    OptimizationRemarkEmitter &ORE, Function &Func)

        : Inst2Matrix(Inst2Matrix), ORE(ORE), Func(Func),

          DL(Func.getDataLayout()) {}


    /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are

    /// instructions in Inst2Matrix returning void or without any users in

    /// \p ExprsInSubprogram. Currently that should only include stores.

    SmallVector<Value *, 4>
    getExpressionLeaves(const SmallSetVector<Value *, 32> &ExprsInSubprogram) {
      // True if at least one user of E is itself part of ExprsInSubprogram.
      auto HasUserInSubprogram = [&ExprsInSubprogram](Value *E) {
        return any_of(E->users(), [&ExprsInSubprogram](User *U) {
          return ExprsInSubprogram.count(U);
        });
      };

      SmallVector<Value *, 4> Result;
      for (Value *Candidate : ExprsInSubprogram) {
        // Void-typed expressions are always leaves; the user check is only
        // evaluated for the others.
        const bool IsLeaf = Candidate->getType()->isVoidTy() ||
                            !HasUserInSubprogram(Candidate);
        if (IsLeaf)
          Result.push_back(Candidate);
      }
      return Result;
    }


    /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf

    /// to all visited expressions in \p Shared. Limit the matrix operations to

    /// the ones in \p ExprsInSubprogram.

    void collectSharedInfo(Value *Leaf, Value *V,
                           const SmallSetVector<Value *, 32> &ExprsInSubprogram,
                           DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
      // Only matrix expressions of the current subprogram are tracked.
      if (!ExprsInSubprogram.count(V))
        return;

      // If Leaf is already recorded for V, V was reached from this leaf
      // before and its operands are handled by that earlier visit. Stopping
      // here yields the same sets, visits operands reachable through several
      // paths only once, and terminates if the operand graph has a cycle.
      if (!Shared[V].insert(Leaf).second)
        return;

      for (Value *Op : cast<Instruction>(V)->operand_values())
        collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);
    }


    /// Calculate the number of exclusive and shared op counts for expression

    /// starting at \p Root. Expressions used multiple times are counted once.

    /// Limit the matrix operations to the ones in \p ExprsInSubprogram.

    std::pair<OpInfoTy, OpInfoTy>
    sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
               const SmallSetVector<Value *, 32> &ExprsInSubprogram,
               DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
      // Values outside the subprogram contribute nothing, and every
      // expression is only counted the first time it is reached.
      if (!ExprsInSubprogram.count(Root) || !ReusedExprs.insert(Root).second)
        return {};

      OpInfoTy SharedTotal;
      OpInfoTy ExclusiveTotal;

      // Root's own ops are exclusive if exactly one leaf is recorded for it
      // in Shared, and shared otherwise.
      auto LeavesIt = Shared.find(Root);
      auto MatrixIt = Inst2Matrix.find(Root);
      const bool UsedBySingleLeaf = LeavesIt->second.size() == 1;
      if (UsedBySingleLeaf)
        ExclusiveTotal = MatrixIt->second.getOpInfo();
      else
        SharedTotal = MatrixIt->second.getOpInfo();

      // Add the counts of all operands, recursively.
      for (Value *Operand : cast<Instruction>(Root)->operand_values()) {
        auto [OperandExclusive, OperandShared] =
            sumOpInfos(Operand, ReusedExprs, ExprsInSubprogram, Shared);
        ExclusiveTotal += OperandExclusive;
        SharedTotal += OperandShared;
      }
      return {ExclusiveTotal, SharedTotal};
    }


    void emitRemarks() {

      if (!ORE.allowExtraAnalysis(DEBUG_TYPE))

        return;


      // Map matrix operations to their containting subprograms, by traversing

      // the inlinedAt chain. If the function does not have a DISubprogram, we

      // only map them to the containing function.

      MapVector<DISubprogram *, SmallVector<Value *, 8>> Subprog2Exprs;

      for (const auto &KV : Inst2Matrix) {

        if (Func.getSubprogram()) {

          auto *I = cast<Instruction>(KV.first);

          DILocation *Context = I->getDebugLoc();

          while (Context) {

            Subprog2Exprs[getSubprogram(Context->getScope())].push_back(

                KV.first);

            Context = DebugLoc(Context).getInlinedAt();

          }

        } else {

          Subprog2Exprs[nullptr].push_back(KV.first);

        }

      }

      for (auto &KV : Subprog2Exprs) {

        SmallSetVector<Value *, 32> ExprsInSubprogram(KV.second.begin(),

                                                      KV.second.end());

        auto Leaves = getExpressionLeaves(ExprsInSubprogram);


        DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;

        for (Value *Leaf : Leaves)

          collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared);


        // Generate remarks for each leaf.

        for (auto *L : Leaves) {


          DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();

          DILocation *Context = cast<Instruction>(L)->getDebugLoc();

          while (Context) {

            if (getSubprogram(Context->getScope()) == KV.first) {

              Loc = Context;

              break;

            }

            Context = DebugLoc(Context).getInlinedAt();

          }


          SmallPtrSet<Value *, 8> ReusedExprs;

          OpInfoTy Counts, SharedCounts;

          std::tie(Counts, SharedCounts) =

              sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared);


          OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,

                                 cast<Instruction>(L)->getParent());


          Rem << "Lowered with ";

          Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "

              << ore::NV("NumLoads", Counts.NumLoads) << " loads, "

              << ore::NV("NumComputeOps", Counts.NumComputeOps)

              << " compute ops, "

              << ore::NV("NumExposedTransposes", Counts.NumExposedTransposes)

              << " exposed transposes";


          if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||

              SharedCounts.NumComputeOps > 0) {

            Rem << ",\nadditionally "

                << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "

                << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "

                << ore::NV("NumFPOps", SharedCounts.NumComputeOps)

                << " compute ops"

                << " are shared with other expressions";

          }


          Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL));

          ORE.emit(Rem);

        }

      }

    }


    /// Return the textual form of the expression rooted at leaf \p L,
    /// produced by an ExprLinearizer starting at indentation 0.
    std::string
    linearize(Value *L,
              const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
              const SmallSetVector<Value *, 32> &ExprsInSubprogram,
              const DataLayout &DL) {
      ExprLinearizer Linearizer(DL, Inst2Matrix, Shared, ExprsInSubprogram, L);
      Linearizer.linearizeExpr(L, /*Indent=*/0, /*ParentReused=*/false,
                               /*ParentShared=*/false);
      return Linearizer.getResult();
    }

  };

};

} // namespace


/// Run the lowering on \p F. In minimal mode the lowering gets no analysis
/// manager. Returns "all preserved" if the lowering reports no change;
/// otherwise loop info and the dominator tree are marked preserved in
/// non-minimal mode only.
PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);

  FunctionAnalysisManager *OptionalAM = Minimal ? nullptr : &AM;
  LowerMatrixIntrinsics LMT(F, TTI, OptionalAM);
  if (!LMT.Visit())
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  if (Minimal)
    return PA;

  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}


/// Print the pass as the PassInfoMixin base prints it, followed by its
/// parameter list: "<minimal>" in minimal mode, "<>" otherwise.
void LowerMatrixIntrinsicsPass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  using BaseMixin = PassInfoMixin<LowerMatrixIntrinsicsPass>;
  static_cast<BaseMixin *>(this)->printPipeline(OS, MapClassName2PassName);

  OS << '<' << (Minimal ? "minimal" : "") << '>';
}


assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

Select
AMDGPU Register Bank Select
Definition AMDGPURegBankSelect.cpp:68

PHI
Rewrite undef for PHI
Definition AMDGPURewriteUndefForPHI.cpp:98

AliasAnalysis.h

Alignment.h

getParent
static const Function * getParent(const Value *V)
Definition BasicAliasAnalysis.cpp:885

BasicBlockUtils.h

BT
BitTracker BT
Definition BitTracker.cpp:68

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

D
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

CommandLine.h

clEnumValN
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition CommandLine.h:688

Compiler.h

LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638

IntrinsicCostStrategy::InstructionCost
@ InstructionCost
Definition CostModel.cpp:52

DataLayout.h

DebugInfoMetadata.h

DerivedTypes.h

DomTreeUpdater.h

GEP
Hexagon Common GEP
Definition HexagonCommonGEP.cpp:164

vectors
hexagon Hexagon specific predictive commoning for HVX vectors
Definition HexagonVectorLoopCarriedReuse.cpp:209

IRBuilder.h

CFG.h
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...

Function.h

IntrinsicInst.h

users
iv users
Definition IVUsers.cpp:48

InstrTypes.h

Instructions.h

TemplateParamKind::Type
@ Type
Definition ItaniumDemangle.h:1243

Ops
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Definition ItaniumDemangle.h:3368

isZero
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539

Matrix
Live Register Matrix
Definition LiveRegMatrix.cpp:44

LoopInfo.h

LoopUtils.h

getSubprogram
static DISubprogram * getSubprogram(DIScope *Scope)
Helper function to either return Scope, if it is a subprogram or the attached subprogram for a local ...
Definition LowerMatrixIntrinsics.cpp:102

ForceFusion
static cl::opt< bool > ForceFusion("force-fuse-matrix", cl::init(false), cl::Hidden, cl::desc("Force matrix instruction fusion even if not profitable."))

VerifyShapeInfo
static cl::opt< bool > VerifyShapeInfo("verify-matrix-shapes", cl::Hidden, cl::desc("Enable/disable matrix shape verification."), cl::init(false))

isSplat
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
Definition LowerMatrixIntrinsics.cpp:110

TileUseLoops
static cl::opt< bool > TileUseLoops("fuse-matrix-use-loops", cl::init(false), cl::Hidden, cl::desc("Generate loop nest for tiling."))

FuseMatrix
static cl::opt< bool > FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden, cl::desc("Enable/disable fusing matrix instructions."))

m_AnyAdd
auto m_AnyAdd(const LTy &L, const RTy &R)
Match any add operation (fp or integer).
Definition LowerMatrixIntrinsics.cpp:124

AllowContractEnabled
static cl::opt< bool > AllowContractEnabled("matrix-allow-contract", cl::init(false), cl::Hidden, cl::desc("Allow the use of FMAs if available and profitable. This may " "result in different results, due to less rounding error."))

MatrixLayoutTy
MatrixLayoutTy
Definition LowerMatrixIntrinsics.cpp:87

MatrixLayoutTy::RowMajor
@ RowMajor
Definition LowerMatrixIntrinsics.cpp:87

MatrixLayoutTy::ColumnMajor
@ ColumnMajor
Definition LowerMatrixIntrinsics.cpp:87

m_AnyMul
auto m_AnyMul(const LTy &L, const RTy &R)
Match any mul operation (fp or integer).
Definition LowerMatrixIntrinsics.cpp:118

PrintAfterTransposeOpt
static cl::opt< bool > PrintAfterTransposeOpt("matrix-print-after-transpose-opt", cl::init(false))

DEBUG_TYPE
#define DEBUG_TYPE
Definition LowerMatrixIntrinsics.cpp:57

TileSize
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))

MatrixLayout
static cl::opt< MatrixLayoutTy > MatrixLayout("matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor), cl::desc("Sets the default matrix layout"), cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major", "Use column-major layout"), clEnumValN(MatrixLayoutTy::RowMajor, "row-major", "Use row-major layout")))

LowerMatrixIntrinsics.h

F
#define F(x, y, z)
Definition MD5.cpp:55

I
#define I(x, y, z)
Definition MD5.cpp:58

MatrixBuilder.h

MatrixUtils.h

Context
@ Context
Definition MemProfContextDisambiguation.cpp:129

T
#define T
Definition Mips16ISelLowering.cpp:353

T1
#define T1
Definition Mips16ISelLowering.cpp:352

II
uint64_t IntrinsicInst * II
Definition NVVMIntrRange.cpp:46

OptimizationRemarkEmitter.h

Operation
PowerPC Reduce CR logical Operation
Definition PPCReduceCRLogicals.cpp:735

PatternMatch.h

PostOrderIterator.h
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.

Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition RISCVRedundantCopyElimination.cpp:71

extractVector
static Value * extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, unsigned EndIndex, const Twine &Name)
Definition SROA.cpp:2605

insertVector
static Value * insertVector(IRBuilderTy &IRB, Value *Old, Value *V, unsigned BeginIndex, const Twine &Name)
Definition SROA.cpp:2627

STLExtras.h
This file contains some templates that are useful if you are working with the STL at all.

ScopeExit.h
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...

SmallVector.h
This file defines the SmallVector class.

Statistic.h
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...

STATISTIC
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171

Debug.h

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition Debug.h:114

getType
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39

BlockSize
static const int BlockSize
Definition TarWriter.cpp:33

Ptr
@ Ptr
Definition TargetLibraryInfo.cpp:77

TargetTransformInfo.h
This pass exposes codegen information to IR-level passes.

getOpcode
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247

ValueTracking.h

VectorUtils.h

LowerStore
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Definition X86ISelLowering.cpp:25584

LowerLoad
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Definition X86ISelLowering.cpp:25669

RHS
Value * RHS
Definition X86PartialReduction.cpp:74

LHS
Value * LHS
Definition X86PartialReduction.cpp:73

llvm::AllocaInst::getAlign
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition Instructions.h:129

llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition PassManager.h:412

llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459

llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213

llvm::BasicBlock::rbegin
reverse_iterator rbegin()
Definition BasicBlock.h:475

llvm::BasicBlock::reverse_iterator
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172

llvm::BasicBlock::rend
reverse_iterator rend()
Definition BasicBlock.h:477

llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233

llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition InstrTypes.h:374

llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition InstrTypes.h:1346

llvm::CallBase::arg_begin
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition InstrTypes.h:1265

llvm::CallBase::getParamAlign
MaybeAlign getParamAlign(unsigned ArgNo) const
Extract the alignment for a call or parameter (0=unknown).
Definition InstrTypes.h:1776

llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition InstrTypes.h:1290

llvm::CallBase::arg_end
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition InstrTypes.h:1271

llvm::CastInst::getOpcode
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition InstrTypes.h:610

llvm::ConstantAggregateZero::get
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
Definition Constants.cpp:1680

llvm::DILocalScope::getSubprogram
LLVM_ABI DISubprogram * getSubprogram() const
Get the subprogram for this scope.
Definition DebugInfoMetadata.cpp:1308

llvm::DIScope
Base class for scope-like contexts.
Definition DebugInfoMetadata.h:571

llvm::DISubprogram
Subprogram description. Uses SubclassData1.
Definition DebugInfoMetadata.h:2270

llvm::DenseMapBase::find
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167

llvm::DenseMapBase::end
iterator end()
Definition DenseMap.h:81

llvm::DenseMap
Definition DenseMap.h:701

llvm::DominatorTreeAnalysis
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284

llvm::ElementCount::getFixed
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310

llvm::FastMathFlags::setAllowContract
void setAllowContract(bool B=true)
Definition FMF.h:90

llvm::FastMathFlags::allowReassoc
bool allowReassoc() const
Flag queries.
Definition FMF.h:64

llvm::FastMathFlags::allowContract
bool allowContract() const
Definition FMF.h:69

llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition DerivedTypes.h:637

llvm::FixedVectorType::get
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803

llvm::Function
Definition Function.h:64

llvm::Function::getIntrinsicID
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intri...
Definition Function.h:244

llvm::Function::isIntrinsic
bool isIntrinsic() const
isIntrinsic - Returns true if the function's name starts with "llvm.".
Definition Function.h:249

llvm::IRBuilderBase::CreateFAddReduce
LLVM_ABI CallInst * CreateFAddReduce(Value *Acc, Value *Src)
Create a sequential vector fadd reduction intrinsic of the source vector.
Definition IRBuilder.cpp:357

llvm::IRBuilderBase::CreateICmpULT
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2348

llvm::IRBuilderBase::CreateInsertElement
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579

llvm::IRBuilderBase::CreateAlloca
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition IRBuilder.h:1833

llvm::IRBuilderBase::CreateExtractElement
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2567

llvm::IRBuilderBase::CreateAlignedLoad
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1867

llvm::IRBuilderBase::CreateZExtOrTrunc
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2103

llvm::IRBuilderBase::CreateMemCpy
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert a memcpy between the specified pointers.
Definition IRBuilder.h:687

llvm::IRBuilderBase::CreateFAdd
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1613

llvm::IRBuilderBase::CreateVectorSplat
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition IRBuilder.cpp:1128

llvm::IRBuilderBase::CreateSelect
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.cpp:1018

llvm::IRBuilderBase::CreateAddReduce
LLVM_ABI CallInst * CreateAddReduce(Value *Src)
Create a vector int add reduction intrinsic of the source vector.
Definition IRBuilder.cpp:367

llvm::IRBuilderBase::getIntPtrTy
IntegerType * getIntPtrTy(const DataLayout &DL, unsigned AddrSpace=0)
Fetch the type of an integer with size at least as big as that of a pointer in the given address spac...
Definition IRBuilder.h:611

llvm::IRBuilderBase::CreateCast
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2241

llvm::IRBuilderBase::setFastMathFlags
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345

llvm::IRBuilderBase::CreateGEP
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1926

llvm::IRBuilderBase::CreateBinaryIntrinsic
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition IRBuilder.cpp:824

llvm::IRBuilderBase::CreateIntrinsic
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition IRBuilder.cpp:835

llvm::IRBuilderBase::CreatePHI
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2497

llvm::IRBuilderBase::getIntN
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition IRBuilder.h:533

llvm::IRBuilderBase::CreateCondBr
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1197

llvm::IRBuilderBase::CreateUnaryIntrinsic
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition IRBuilder.cpp:816

llvm::IRBuilderBase::CreateLoad
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1850

llvm::IRBuilderBase::CreateShuffleVector
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601

llvm::IRBuilderBase::CreateAdd
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1403

llvm::IRBuilderBase::CreatePtrToInt
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2197

llvm::IRBuilderBase::CreateBinOp
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708

llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207

llvm::IRBuilderBase::CreateAlignedStore
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition IRBuilder.h:1886

llvm::IRBuilderBase::CreateFMul
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1651

llvm::IRBuilderBase::CreateFNeg
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1793

llvm::IRBuilderBase::CreateMul
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788

llvm::Instruction
Definition Instruction.h:69

llvm::Instruction::moveAfter
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition Instruction.cpp:200

llvm::Instruction::setFastMathFlags
LLVM_ABI void setFastMathFlags(FastMathFlags FMF)
Convenience function for setting multiple fast-math flags on this instruction, which must be an opera...
Definition Instruction.cpp:633

llvm::Instruction::eraseFromParent
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition Instruction.cpp:108

llvm::Instruction::getFastMathFlags
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
Definition Instruction.cpp:683

llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition IntrinsicInst.h:56

llvm::LoadInst::isVolatile
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition Instructions.h:210

llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition Instructions.h:216

llvm::LocationSize::getValue
TypeSize getValue() const
Definition MemoryLocation.h:158

llvm::LoopAnalysis
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569

llvm::LowerMatrixIntrinsicsPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition LowerMatrixIntrinsics.cpp:2845

llvm::LowerMatrixIntrinsicsPass::printPipeline
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
Definition LowerMatrixIntrinsics.cpp:2861

llvm::MatrixBuilder::CreateMatrixTranspose
CallInst * CreateMatrixTranspose(Value *Matrix, unsigned Rows, unsigned Columns, const Twine &Name="")
Create a llvm.matrix.transpose call, transposing Matrix with Rows rows and Columns columns.
Definition MatrixBuilder.h:110

llvm::MatrixBuilder::CreateMatrixMultiply
CallInst * CreateMatrixMultiply(Value *LHS, Value *RHS, unsigned LHSRows, unsigned LHSColumns, unsigned RHSColumns, const Twine &Name="")
Create a llvm.matrix.multiply call, multiplying matrixes LHS and RHS.
Definition MatrixBuilder.h:126

llvm::MemoryLocation::get
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
Definition MemoryLocation.cpp:36

llvm::MemoryLocation::Size
LocationSize Size
The maximum size of the location, in address-units, or UnknownSize if the size is not known.
Definition MemoryLocation.h:234

llvm::MemoryLocation::Ptr
const Value * Ptr
The address of the start of the location.
Definition MemoryLocation.h:225

llvm::MemoryLocation::getForArgument
static LLVM_ABI MemoryLocation getForArgument(const CallBase *Call, unsigned ArgIdx, const TargetLibraryInfo *TLI)
Return a location representing a particular argument of a call.
Definition MemoryLocation.cpp:181

llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition Instructions.h:2774

llvm::PHINode::blocks
iterator_range< const_block_iterator > blocks() const
Definition Instructions.h:2700

llvm::PHINode::incoming_values
op_range incoming_values()
Definition Instructions.h:2704

llvm::PoisonValue::get
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition Constants.cpp:1888

llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112

llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118

llvm::PreservedAnalyses::preserve
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132

llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102

llvm::SetVector::insert_range
void insert_range(Range &&R)
Definition SetVector.h:175

llvm::SetVector::count
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:261

llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150

llvm::SmallPtrSetImplBase::empty
bool empty() const
Definition SmallPtrSet.h:98

llvm::SmallPtrSetImpl::erase
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition SmallPtrSet.h:404

llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition SmallPtrSet.h:455

llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition SmallPtrSet.h:389

llvm::SmallPtrSetImpl::contains
bool contains(ConstPtrType Ptr) const
Definition SmallPtrSet.h:461

llvm::SmallVectorImpl::pop_back_val
T pop_back_val()
Definition SmallVector.h:673

llvm::SmallVectorImpl::append
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition SmallVector.h:683

llvm::SmallVectorImpl::resize
void resize(size_type N)
Definition SmallVector.h:638

llvm::SmallVectorTemplateBase::pop_back
void pop_back()
Definition SmallVector.h:428

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:416

llvm::SmallVectorTemplateCommon::size
size_t size() const
Definition SmallVector.h:79

llvm::SmallVectorTemplateCommon::back
reference back()
Definition SmallVector.h:311

llvm::SmallVectorTemplateCommon::empty
bool empty() const
Definition SmallVector.h:82

llvm::StoreInst::getAlign
Align getAlign() const
Definition Instructions.h:339

llvm::StoreInst::isVolatile
bool isVolatile() const
Return true if this is a store to a volatile memory location.
Definition Instructions.h:331

llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:55

llvm::StringRef::drop_front
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition StringRef.h:611

llvm::StringRef::size
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146

llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition TargetTransformInfo.h:1989

llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition TargetTransformInfo.h:276

llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition TargetTransformInfo.h:1215

llvm::TargetTransformInfo::SK_Splice
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
Definition TargetTransformInfo.h:1145

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45

llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352

llvm::Type::getPrimitiveSizeInBits
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198

llvm::Type::getScalarSizeInBits
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231

llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139

llvm::UnaryOperator::getOpcode
UnaryOps getOpcode() const
Definition InstrTypes.h:154

llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition User.h:232

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256

llvm::Value::user_begin
user_iterator user_begin()
Definition Value.h:402

llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439

llvm::Value::replaceAllUsesWith
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546

llvm::Value::users
iterator_range< user_iterator > users()
Definition Value.h:426

llvm::Value::use_empty
bool use_empty() const
Definition Value.h:346

llvm::Value::uses
iterator_range< use_iterator > uses()
Definition Value.h:380

llvm::Value::getName
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322

llvm::VectorType::getElementType
Type * getElementType() const
Definition DerivedTypes.h:463

llvm::cl::opt
Definition CommandLine.h:1455

llvm::details::FixedOrScalableQuantity::getFixedValue
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201

llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition STLFunctionalExtras.h:37

llvm::ilist_detail::node_parent_access::getParent
const ParentTy * getParent() const
Definition ilist_node.h:34

llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition ilist_node.h:123

llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53

Changed
Changed
Definition ObjCARCOpts.cpp:2370

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

llvm::AArch64PACKey::IB
@ IB
Definition AArch64BaseInfo.h:904

llvm::AMDGPU::HSAMD::Kernel::Arg::Key::Align
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
Definition AMDGPUMetadata.h:183

llvm::ARM_MB::ST
@ ST
Definition ARMBaseInfo.h:73

llvm::ARM::ProfileKind::M
@ M
Definition ARMTargetParser.h:171

llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition BitmaskEnum.h:127

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34

llvm::GraphProgram::Name
Name
Definition GraphWriter.h:51

llvm::ISD::BasicBlock
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81

llvm::Intrinsic::getBaseName
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
Definition Intrinsics.cpp:44

llvm::M68k::MemAddrModeKind::U
@ U
Definition M68kBaseInfo.h:61

llvm::M68k::MemAddrModeKind::V
@ V
Definition M68kBaseInfo.h:63

llvm::M68k::MemAddrModeKind::K
@ K
Definition M68kBaseInfo.h:68

llvm::M68k::MemAddrModeKind::L
@ L
Definition M68kBaseInfo.h:70

llvm::MIPatternMatch::m_OneUse
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
Definition MIPatternMatch.h:56

llvm::NVPTX::Shared
@ Shared
Definition NVPTX.h:184

llvm::PatternMatch
Definition PatternMatch.h:47

llvm::PatternMatch::m_Store
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
Definition PatternMatch.h:2051

llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition PatternMatch.h:1174

llvm::PatternMatch::m_BinOp
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition PatternMatch.h:113

llvm::PatternMatch::m_SpecificInt
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition PatternMatch.h:1052

llvm::PatternMatch::m_FMul
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
Definition PatternMatch.h:1246

llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition PatternMatch.h:49

llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition PatternMatch.h:954

llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition PatternMatch.h:181

llvm::PatternMatch::m_Intrinsic
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
Definition PatternMatch.h:2776

llvm::PatternMatch::m_FAdd
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
Definition PatternMatch.h:1180

llvm::PatternMatch::m_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
Definition PatternMatch.h:1240

llvm::PatternMatch::m_Load
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
Definition PatternMatch.h:2044

llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition PatternMatch.h:105

llvm::PatternMatch::m_CombineOr
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition PatternMatch.h:252

llvm::RISCVFenceField::R
@ R
Definition RISCVBaseInfo.h:463

llvm::SIEncodingFamily::SI
@ SI
Definition SIDefines.h:36

llvm::SI
Definition SIInstrInfo.h:1745

llvm::SPII::Store
@ Store
Definition SparcInstrInfo.h:33

llvm::SPII::Load
@ Load
Definition SparcInstrInfo.h:32

llvm::X86AS::SS
@ SS
Definition X86.h:215

llvm::X86II::TA
@ TA
Definition X86BaseInfo.h:738

llvm::cl::Hidden
@ Hidden
Definition CommandLine.h:139

llvm::cl::values
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition CommandLine.h:713

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:445

llvm::codeview::EncodedFramePtrReg::BasePtr
@ BasePtr
Definition CodeView.h:528

llvm::codeview::FrameCookieKind::Copy
@ Copy
Definition CodeView.h:495

llvm::codeview::PublicSymFlags::Function
@ Function
Definition CodeView.h:409

llvm::dwarf_linker::DebugSectionKind::DebugLoc
@ DebugLoc
Definition DWARFLinkerBase.h:34

llvm::dxil::ElementType
ElementType
The element type of an SRV or UAV resource.
Definition DXILABI.h:60

llvm::memprof::Meta::Start
@ Start
Definition MemProf.h:69

llvm::ms_demangle::IntrinsicFunctionKind::New
@ New
Definition MicrosoftDemangleNodes.h:121

llvm::ms_demangle::QualifierMangleMode::Result
@ Result
Definition MicrosoftDemangle.h:132

llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition OptimizationRemarkEmitter.h:139

llvm::rdf::Phi
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390

llvm::sandboxir::Instruction
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition AddressRanges.h:18

llvm::drop_begin
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316

llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition SparseBitVector.h:874

llvm::Offset
@ Offset
Definition DWP.cpp:477

llvm::Value
FunctionAddr VTableAddr Value
Definition InstrProf.h:137

llvm::fill
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745

llvm::PseudoProbeType::Block
@ Block
Definition PseudoProbe.h:30

llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655

llvm::zip_equal
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839

llvm::make_scope_exit
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59

llvm::enumerate
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644

llvm::successors
auto successors(const MachineBasicBlock *BB)
Definition MachineBasicBlock.h:1437

llvm::operator!=
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2113

llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition iterator_range.h:70

llvm::operator+=
LLVM_ATTRIBUTE_ALWAYS_INLINE DynamicAPInt & operator+=(DynamicAPInt &A, int64_t B)
Definition DynamicAPInt.h:531

llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632

llvm::concatenateVectors
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Definition VectorUtils.cpp:1228

llvm::operator==
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
Definition AddressRanges.h:151

llvm::getPointerOperand
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
Definition Instructions.h:5101

llvm::addStringMetadataToLoop
LLVM_ABI void addStringMetadataToLoop(Loop *TheLoop, const char *MDString, unsigned V=0)
Set input string into loop metadata by keeping other values intact.
Definition LoopUtils.cpp:215

llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732

llvm::reverse
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406

llvm::write
LLVM_ABI Error write(MCStreamer &Out, ArrayRef< std::string > Inputs, OnCuIndexOverflow OverflowOptValue)
Definition DWP.cpp:622

llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622

llvm::ComplexDeinterleavingOperation::Splat
@ Splat
Definition ComplexDeinterleavingPass.h:42

llvm::dbgs
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207

llvm::report_fatal_error
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167

llvm::Count
FunctionAddr VTableAddr Count
Definition InstrProf.h:139

llvm::SmallVector
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
Definition SmallVector.h:1122

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548

llvm::errs
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
Definition raw_ostream.cpp:908

llvm::TTI
TargetTransformInfo TTI
Definition TargetTransformInfo.h:218

llvm::IRBuilder
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >

llvm::RecurKind::Mul
@ Mul
Product of integers.
Definition IVDescriptors.h:40

llvm::RecurKind::Add
@ Add
Sum of integers.
Definition IVDescriptors.h:37

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:22

llvm::operator<<
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition APFixedPoint.h:312

llvm::ArrayRef
ArrayRef(const T &OneElt) -> ArrayRef< T >

llvm::copy
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1835

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560

llvm::SplitBlock
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
Definition BasicBlockUtils.cpp:961

llvm::commonAlignment
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201

llvm::FunctionAnalysisManager
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
Definition PassManager.h:564

llvm::VFParamKind::Vector
@ Vector
Definition VFABIDemangler.h:27

llvm::getUnderlyingObject
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
Definition ValueTracking.cpp:6683

llvm::AliasAnalysis
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
Definition AliasAnalysis.h:721

llvm::createSequentialMask
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
Definition VectorUtils.cpp:1173

std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869

N
#define N

llvm::PassInfoMixin
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:70

llvm::cl::desc
Definition CommandLine.h:411