docs/doxygen/LowerMatrixIntrinsics_8cpp_source.html

//===- LowerMatrixIntrinsics.cpp -  Lower matrix intrinsics -----*- C++ -*-===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// Lower matrix intrinsics to vector operations.

//

// TODO:

//  * Improve fusion:

//   * Support more cases, e.g. multiply-add, multiply-sub, operands/results

//     transposed.

//   * Improve cost-modeling, e.g. choose different number of rows/columns

//     for tiles, consider cost of copies on alias.

//

//===----------------------------------------------------------------------===//


#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"

#include "llvm/ADT/PostOrderIterator.h"

#include "llvm/ADT/ScopeExit.h"

#include "llvm/ADT/SmallSet.h"

#include "llvm/ADT/SmallVector.h"

#include "llvm/Analysis/AliasAnalysis.h"

#include "llvm/Analysis/DomTreeUpdater.h"

#include "llvm/Analysis/LoopInfo.h"

#include "llvm/Analysis/OptimizationRemarkEmitter.h"

#include "llvm/Analysis/TargetTransformInfo.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/Analysis/VectorUtils.h"

#include "llvm/IR/CFG.h"

#include "llvm/IR/DataLayout.h"

#include "llvm/IR/DebugInfoMetadata.h"

#include "llvm/IR/Function.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/Instructions.h"

#include "llvm/IR/IntrinsicInst.h"

#include "llvm/IR/MatrixBuilder.h"

#include "llvm/IR/PatternMatch.h"

#include "llvm/Support/Alignment.h"

#include "llvm/Support/CommandLine.h"

#include "llvm/Support/Debug.h"

#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#include "llvm/Transforms/Utils/LoopUtils.h"

#include "llvm/Transforms/Utils/MatrixUtils.h"


#include <cmath>


using namespace llvm;

using namespace PatternMatch;


#define DEBUG_TYPE "lower-matrix-intrinsics"


// Command-line options controlling the matrix lowering.

// Fusion of matrix instructions; enabled by default.
static cl::opt<bool>
    FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,
               cl::desc("Enable/disable fusing matrix instructions."));
// Edge length of the square tiles used for fusion; defaults to 4.
// TODO: Allow and use non-square tiles.
static cl::opt<unsigned> TileSize(
    "fuse-matrix-tile-size", cl::init(4), cl::Hidden,
    cl::desc(
        "Tile size for matrix instruction fusion using square-shaped tiles."));
// Emit a loop nest for tiling; disabled by default.
static cl::opt<bool> TileUseLoops("fuse-matrix-use-loops", cl::init(false),
                                  cl::Hidden,
                                  cl::desc("Generate loop nest for tiling."));
// Fuse even when fusion is not considered profitable; disabled by default.
static cl::opt<bool> ForceFusion(
    "force-fuse-matrix", cl::init(false), cl::Hidden,
    cl::desc("Force matrix instruction fusion even if not profitable."));
// When set, getFastMathFlags() always enables the allow-contract flag,
// regardless of the flags on the instruction being lowered.
static cl::opt<bool> AllowContractEnabled(
    "matrix-allow-contract", cl::init(false), cl::Hidden,
    cl::desc("Allow the use of FMAs if available and profitable. This may "
             "result in different results, due to less rounding error."));

// When set, setShapeInfo() aborts compilation if a value is assigned a shape
// that conflicts with the one already recorded for it.
static cl::opt<bool>
    VerifyShapeInfo("verify-matrix-shapes", cl::Hidden,
                    cl::desc("Enable/disable matrix shape verification."),
                    cl::init(false));

// Memory layout used for the vectors a matrix is split into.
enum class MatrixLayoutTy { ColumnMajor, RowMajor };

// Default layout; ShapeInfo and MatrixTy sample this option when constructed.
static cl::opt<MatrixLayoutTy> MatrixLayout(
    "matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor),
    cl::desc("Sets the default matrix layout"),
    cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major",
                          "Use column-major layout"),
               clEnumValN(MatrixLayoutTy::RowMajor, "row-major",
                          "Use row-major layout")));

// NOTE(review): unlike the options above, this one is neither cl::Hidden nor
// described; its use is outside the visible part of the file.
static cl::opt<bool> PrintAfterTransposeOpt("matrix-print-after-transpose-opt",
                                            cl::init(false));


/// Resolve \p Scope to a DISubprogram: \p Scope itself if it already is a
/// subprogram, otherwise the subprogram the local scope is attached to.
static DISubprogram *getSubprogram(DIScope *Scope) {
  auto *AsSubprogram = dyn_cast<DISubprogram>(Scope);
  return AsSubprogram ? AsSubprogram
                      : cast<DILocalScope>(Scope)->getSubprogram();
}


/// Return true if \p V is a shuffle that splats its element zero. Such splats
/// are used when multiplying a matrix with a scalar.
static bool isSplat(Value *V) {
  const auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
  return Shuffle && Shuffle->isZeroEltSplat();
}


/// Matcher accepting either an integer or a floating-point multiply of \p L
/// and \p R.
template <typename LTy, typename RTy>
auto m_AnyMul(const LTy &L, const RTy &R) {
  auto IntMul = m_Mul(L, R);
  auto FPMul = m_FMul(L, R);
  return m_CombineOr(IntMul, FPMul);
}


/// Matcher accepting either an integer or a floating-point add of \p L and
/// \p R.
template <typename LTy, typename RTy>
auto m_AnyAdd(const LTy &L, const RTy &R) {
  auto IntAdd = m_Add(L, R);
  auto FPAdd = m_FAdd(L, R);
  return m_CombineOr(IntAdd, FPAdd);
}


namespace {


// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute

// the start address of vector \p VecIdx with type (\p EltType x \p NumElements)

// assuming \p Stride elements between the starts of two consecutive vectors.

// \p Stride must be >= \p NumElements.

// For column-major matrixes, the function computes the address of a column

// vector and \p NumElements must be set to the number of elements in a column

// (= number of rows of the matrix). For row-major matrixes, the function

// computes the address of a row vector and \p NumElements must be set to the

// number of elements in a row (= number of columns of the matrix).

//

// Consider a 4x4 matrix in column-major layout like below

//

//      0       1      2      3

// 0   v_0_0  v_0_1  v_0_2  v_0_3

// 1   v_1_0  v_1_1  v_1_2  v_1_3

// 2   v_2_0  v_2_1  v_2_2  v_2_3

// 3   v_3_0  v_3_1  v_3_2  v_3_3


// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1,

// we need a pointer to the first element of the submatrix as base pointer.

// Then we can use computeVectorAddr to compute the addresses for the columns

// of the sub-matrix.

//

// Column 0: computeVectorAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)

//           -> just returns Base

// Column 1: computeVectorAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)

//           -> returns Base + (1 * 4)

// Column 2: computeVectorAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)

//           -> returns Base + (2 * 4)

//

// The graphic below illustrates the number of elements in a column (marked

// with |) and the number of skipped elements (marked with }).

//

//         v_0_0  v_0_1 {v_0_2 {v_0_3

//                Base   Col 1  Col 2

//                  |     |      |

//         v_1_0 |v_1_1 |v_1_2 |v_1_3

//         v_2_0 |v_2_1 |v_2_2 |v_2_3

//         v_3_0 {v_3_1 {v_3_2  v_3_3

//

Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
                         unsigned NumElements, Type *EltType,
                         IRBuilder<> &Builder) {
  // Only a constant stride can be validated here; a stride smaller than the
  // vector length would make consecutive vectors overlap.
  assert(!(isa<ConstantInt>(Stride) &&
           cast<ConstantInt>(Stride)->getZExtValue() < NumElements) &&
         "Stride must be >= the number of elements in the result vector.");

  // Element offset of the requested vector: VecIdx * Stride.
  Value *Offset = Builder.CreateMul(VecIdx, Stride, "vec.start");

  // If the offset is the constant zero the vector starts at BasePtr itself,
  // so no GEP has to be emitted.
  if (auto *ConstOffset = dyn_cast<ConstantInt>(Offset))
    if (ConstOffset->isZero())
      return BasePtr;

  return Builder.CreateGEP(EltType, BasePtr, Offset, "vec.gep");
}


namespace {

/// Shape (rows x columns) of a matrix value together with the layout used to
/// split it into vectors. The layout is sampled from the MatrixLayout option
/// when the object is constructed. A default-constructed ShapeInfo (0 x 0)
/// represents "no shape information".
struct ShapeInfo {
  unsigned NumRows;
  unsigned NumColumns;

  bool IsColumnMajor;

  ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
      : NumRows(NumRows), NumColumns(NumColumns),
        IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}

  /// Build a shape from two values that must both be ConstantInts.
  ShapeInfo(Value *NumRows, Value *NumColumns)
      : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(),
                  cast<ConstantInt>(NumColumns)->getZExtValue()) {}

  /// Two shapes are equal if both dimensions match; the layout is not
  /// compared. Const-qualified so that const shapes can be compared too.
  bool operator==(const ShapeInfo &other) const {
    return NumRows == other.NumRows && NumColumns == other.NumColumns;
  }
  bool operator!=(const ShapeInfo &other) const { return !(*this == other); }

  /// Returns true if shape-information is defined, meaning both dimensions
  /// are != 0.
  operator bool() const {
    assert(NumRows == 0 || NumColumns != 0);
    return NumRows != 0;
  }

  /// Number of elements between the starts of two consecutive vectors: the
  /// number of rows for column-major layout, the number of columns otherwise.
  unsigned getStride() const {
    if (IsColumnMajor)
      return NumRows;
    return NumColumns;
  }

  /// Number of vectors the matrix is split into: one per column for
  /// column-major layout, one per row otherwise.
  unsigned getNumVectors() const {
    if (IsColumnMajor)
      return NumColumns;
    return NumRows;
  }

  /// Returns the transposed shape.
  ShapeInfo t() const { return ShapeInfo(NumColumns, NumRows); }
};

} // namespace


/// Return true if \p V is not an instruction, or is one of the arithmetic
/// instructions (fadd, fsub, fmul, fneg, add, mul, sub) for which shape
/// propagation treats the result and the operands as having one shape.
static bool isUniformShape(Value *V) {
  const auto *Inst = dyn_cast<Instruction>(V);
  if (Inst == nullptr)
    return true;

  const unsigned Opcode = Inst->getOpcode();
  const bool IsFloatingPointOp =
      Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
      Opcode == Instruction::FMul || // Scalar multiply.
      Opcode == Instruction::FNeg;
  const bool IsIntegerOp = Opcode == Instruction::Add ||
                           Opcode == Instruction::Mul ||
                           Opcode == Instruction::Sub;
  return IsFloatingPointOp || IsIntegerOp;
}


/// Return the ShapeInfo for the result of \p I, if it can be determined.
///
/// Matrix intrinsics carry their dimensions as operands. Plain stores and
/// uniform-shape instructions take the shape of an operand that already has
/// an entry in \p ShapeMap. Returns std::nullopt if none of these apply.
static std::optional<ShapeInfo>
computeShapeInfoForInst(Instruction *I,
                        const DenseMap<Value *, ShapeInfo> &ShapeMap) {
  Value *M;
  Value *N;
  Value *K;
  // multiply(A, B, M, N, K): (M x N) * (N x K) yields M x K.
  if (match(I, m_Intrinsic<Intrinsic::matrix_multiply>(
                   m_Value(), m_Value(), m_Value(M), m_Value(N), m_Value(K))))
    return ShapeInfo(M, K);
  if (match(I, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(), m_Value(M),
                                                        m_Value(N)))) {
    // Flip dimensions.
    return ShapeInfo(N, M);
  }
  // column.major.store(Matrix, Ptr, Stride, IsVolatile, Rows, Cols): the store
  // is given the shape of the stored matrix, Rows x Cols. This agrees with
  // the shape propagateShapeBackward assigns to the stored operand and with
  // how plain stores are handled below.
  if (match(I, m_Intrinsic<Intrinsic::matrix_column_major_store>(
                   m_Value(), m_Value(), m_Value(), m_Value(), m_Value(M),
                   m_Value(N))))
    return ShapeInfo(M, N);
  // column.major.load(Ptr, Stride, IsVolatile, Rows, Cols) yields Rows x Cols.
  if (match(I, m_Intrinsic<Intrinsic::matrix_column_major_load>(
                   m_Value(), m_Value(), m_Value(), m_Value(M), m_Value(N))))
    return ShapeInfo(M, N);
  // A plain store is lowered with the shape of the value it stores.
  Value *MatrixA;
  if (match(I, m_Store(m_Value(MatrixA), m_Value()))) {
    auto OpShape = ShapeMap.find(MatrixA);
    if (OpShape != ShapeMap.end())
      return OpShape->second;
  }

  if (isUniformShape(I)) {
    // Find the first operand that has a known shape and use that.
    for (auto &Op : I->operands()) {
      auto OpShape = ShapeMap.find(Op.get());
      if (OpShape != ShapeMap.end())
        return OpShape->second;
    }
  }
  return std::nullopt;
}


/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.

///

/// Currently, the lowering for each matrix intrinsic is done as follows:

/// 1. Propagate the shape information from intrinsics to connected

/// instructions.

/// 2. Lower instructions with shape information (assuming column-major layout).

///  The lowering works similarly using row-major layout.

///  2.1. Get column vectors for each argument. If we already lowered the

///       definition of an argument, use the produced column vectors directly.

///       If not, split the operand vector containing an embedded matrix into

///       a set of column vectors,

///  2.2. Lower the instruction in terms of column major operations, which

///       yields a set of column vectors containing result matrix. Note that we

///       lower all instructions that have shape information. Besides the

///       intrinsics, this includes stores for example.

///  2.3. Update uses of the lowered instruction. If we have shape information

///       for a user, there is nothing to do, as we will look up the result

///       column matrix when lowering the user. For other uses, we embed the

///       result matrix in a flat vector and update the use.

///  2.4. Cache the result column matrix for the instruction we lowered

/// 3. After we lowered all instructions in a function, remove the now

///    obsolete instructions.

///

class LowerMatrixIntrinsics {

  // State set by the constructor: the function handed to the pass, its data
  // layout, the target transform info and the analysis manager.
  Function &Func;
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  FunctionAnalysisManager *AM;
  // Analyses that default to null; they are not set in the constructor.
  // isMinimal() reports true while DT is null.
  AliasAnalysis *AA = nullptr;
  DominatorTree *DT = nullptr;
  LoopInfo *LI = nullptr;
  OptimizationRemarkEmitter *ORE = nullptr;


  /// Estimated operation counts (loads, stores, compute ops, exposed
  /// transposes) required to lower a matrix operation.
  struct OpInfoTy {
    /// Number of stores emitted to generate this matrix.
    unsigned NumStores = 0;
    /// Number of loads emitted to generate this matrix.
    unsigned NumLoads = 0;
    /// Number of compute operations emitted to generate this matrix.
    unsigned NumComputeOps = 0;
    /// Number of transposes that could not be made "free". Most of the time
    /// transposes can be fused with matrix multiplies or folded away via
    /// algebraic simplifications; this counts the ones where that failed.
    unsigned NumExposedTransposes = 0;

    /// Accumulate the counters of \p Other into this object, field by field.
    OpInfoTy &operator+=(const OpInfoTy &Other) {
      NumExposedTransposes += Other.NumExposedTransposes;
      NumComputeOps += Other.NumComputeOps;
      NumLoads += Other.NumLoads;
      NumStores += Other.NumStores;
      return *this;
    }
  };


  /// Wrapper class representing a matrix as a set of vectors, either in row or

  /// column major layout. All vectors must have the same vector type.

  class MatrixTy {

    SmallVector<Value *, 16> Vectors;


    OpInfoTy OpInfo;


    bool IsColumnMajor = true;


  public:

    MatrixTy() : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}

    MatrixTy(ArrayRef<Value *> Vectors)

        : Vectors(Vectors),

          IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}

    MatrixTy(unsigned NumRows, unsigned NumColumns, Type *EltTy)

        : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {


      unsigned D = isColumnMajor() ? NumColumns : NumRows;

      for (unsigned J = 0; J < D; ++J)

        addVector(PoisonValue::get(FixedVectorType::get(

            EltTy, isColumnMajor() ? NumRows : NumColumns)));

    }


    Value *getVector(unsigned i) const { return Vectors[i]; }

    Value *getColumn(unsigned i) const {

      assert(isColumnMajor() && "only supported for column-major matrixes");

      return Vectors[i];

    }

    Value *getRow(unsigned i) const {

      assert(!isColumnMajor() && "only supported for row-major matrixes");

      return Vectors[i];

    }


    void setVector(unsigned i, Value *V) { Vectors[i] = V; }


    Type *getElementType() const { return getVectorTy()->getElementType(); }


    unsigned getNumVectors() const {

      if (isColumnMajor())

        return getNumColumns();

      return getNumRows();

    }


    unsigned getNumColumns() const {

      if (isColumnMajor())

        return Vectors.size();

      else {

        assert(Vectors.size() > 0 && "Cannot call getNumRows without columns");

        return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements();

      }

    }

    unsigned getNumRows() const {

      if (isColumnMajor()) {

        assert(Vectors.size() > 0 && "Cannot call getNumRows without columns");

        return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements();

      } else

        return Vectors.size();

    }


    void addVector(Value *V) { Vectors.push_back(V); }

    VectorType *getColumnTy() {

      assert(isColumnMajor() && "only supported for column-major matrixes");

      return getVectorTy();

    }


    VectorType *getVectorTy() const {

      return cast<VectorType>(Vectors[0]->getType());

    }


    iterator_range<SmallVector<Value *, 8>::iterator> columns() {

      assert(isColumnMajor() &&

             "columns() only supported for column-major matrixes");

      return make_range(Vectors.begin(), Vectors.end());

    }


    iterator_range<SmallVector<Value *, 8>::iterator> vectors() {

      return make_range(Vectors.begin(), Vectors.end());

    }


    /// Embed the vectors of the matrix into a flat vector by concatenating

    /// them.

    Value *embedInVector(IRBuilder<> &Builder) const {

      return Vectors.size() == 1 ? Vectors[0]

                                 : concatenateVectors(Builder, Vectors);

    }


    MatrixTy &addNumLoads(unsigned N) {

      OpInfo.NumLoads += N;

      return *this;

    }


    void setNumLoads(unsigned N) { OpInfo.NumLoads = N; }


    MatrixTy &addNumStores(unsigned N) {

      OpInfo.NumStores += N;

      return *this;

    }


    MatrixTy &addNumExposedTransposes(unsigned N) {

      OpInfo.NumExposedTransposes += N;

      return *this;

    }


    MatrixTy &addNumComputeOps(unsigned N) {

      OpInfo.NumComputeOps += N;

      return *this;

    }


    unsigned getNumStores() const { return OpInfo.NumStores; }

    unsigned getNumLoads() const { return OpInfo.NumLoads; }

    unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; }


    const OpInfoTy &getOpInfo() const { return OpInfo; }


    bool isColumnMajor() const { return IsColumnMajor; }


    unsigned getStride() const {

      if (isColumnMajor())

        return getNumRows();

      return getNumColumns();

    }


    /// Extract a vector of \p NumElts starting at index (\p I, \p J). If the

    /// matrix is column-major, the result vector is extracted from a column

    /// vector, otherwise from a row vector.

    Value *extractVector(unsigned I, unsigned J, unsigned NumElts,

                         IRBuilder<> &Builder) const {

      Value *Vec = isColumnMajor() ? getColumn(J) : getRow(I);

      assert(cast<FixedVectorType>(Vec->getType())->getNumElements() >=

                 NumElts &&

             "Extracted vector will contain poison values");

      return Builder.CreateShuffleVector(

          Vec, createSequentialMask(isColumnMajor() ? I : J, NumElts, 0),

          "block");

    }

  };


  /// Maps instructions to their shape information. The shape information

  /// describes the shape to be used while lowering. This matches the shape of

  /// the result value of the instruction, with the only exceptions being store

  /// instructions and the matrix_column_major_store intrinsics. For those, the

  /// shape information indicates that those instructions should be lowered

  /// using shape information as well. Note that extra care is needed when

  /// erasing or RAUW'ing a value that is present in ShapeMap. If the

  /// replacement is also a matrix operation, use

  /// updateShapeAndReplaceAllUsesWith to make sure the replacement is added to

  /// ShapeMap.  We don't use ValueMap, as there are also cases where we do not

  /// want to add shape information for a replacement instruction. When directly

  /// erasing a value with an entry in ShapeMap, use

  /// eraseFromParentAndRemoveFromShapeMap to make sure ShapeMap is also updated

  /// accordingly.

  DenseMap<Value *, ShapeInfo> ShapeMap;


  /// List of instructions to remove. While lowering, we are not replacing all

  /// users of a lowered instruction, if shape information is available and

  /// those need to be removed after we finished lowering.

  SmallVector<Instruction *, 16> ToRemove;


  /// Map from instructions to their produced column matrix.

  MapVector<Value *, MatrixTy> Inst2ColumnMatrix;


private:

  /// Fast-math flags to use when lowering \p Inst: the flags of \p Inst if it
  /// is a floating-point math operator (empty flags otherwise), with
  /// allow-contract additionally enabled when AllowContractEnabled is set.
  static FastMathFlags getFastMathFlags(Instruction *Inst) {
    FastMathFlags Flags =
        isa<FPMathOperator>(*Inst) ? Inst->getFastMathFlags() : FastMathFlags();

    // The option can only turn contraction on; it never clears a flag that
    // the instruction already carries.
    if (AllowContractEnabled)
      Flags.setAllowContract(true);

    return Flags;
  }


public:

  /// Set up the lowering for \p F. The data layout is taken from \p F; \p TTI
  /// and \p AM are stored as given. The remaining analysis pointers keep
  /// their null defaults.
  LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
                        FunctionAnalysisManager *AM)
      : Func(F), DL(F.getDataLayout()), TTI(TTI), AM(AM) {}


  /// Return the estimated number of vector ops required for an operation on
  /// the fixed vector type \p VT.
  unsigned getNumOps(Type *VT) {
    assert(isa<VectorType>(VT) && "Expected vector type");
    auto *FixedVT = cast<FixedVectorType>(VT);
    return getNumOps(FixedVT->getScalarType(), FixedVT->getNumElements());
  }


  /// Returns true for the minimal version of the pass executed in the backend
  /// pipelines, which is recognized by the absence of a dominator tree.
  bool isMinimal() const { return DT == nullptr; }


  /// Return the estimated number of vector ops required for an operation on
  /// \p N elements of scalar type \p ST: the total bit width divided by the
  /// target's fixed-width vector register size, rounded up.
  unsigned getNumOps(Type *ST, unsigned N) {
    const double TotalBits = (ST->getPrimitiveSizeInBits() * N).getFixedValue();
    const double RegisterBits =
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
            .getFixedValue();
    return std::ceil(TotalBits / RegisterBits);
  }


  /// Return the set of vectors that a matrix value is lowered to.
  ///
  /// If \p MatrixVal was already lowered and the cached matrix has shape
  /// \p SI, the cached matrix is returned. Otherwise the flat vector (the
  /// re-embedded cached matrix on a shape mismatch, else \p MatrixVal itself)
  /// is split into vectors of SI.getStride() elements each.
  MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
                     IRBuilder<> &Builder) {
    auto *VecTy = dyn_cast<VectorType>(MatrixVal->getType());
    assert(VecTy && "MatrixVal must be a vector type");
    const unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
    assert(NumElts == SI.NumRows * SI.NumColumns &&
           "The vector size must match the number of matrix elements");

    // Reuse a previously lowered matrix when its shape is the requested one.
    // On a shape mismatch, flatten it so it can be re-split below.
    if (auto Cached = Inst2ColumnMatrix.find(MatrixVal);
        Cached != Inst2ColumnMatrix.end()) {
      MatrixTy &Lowered = Cached->second;
      const bool SameShape = SI.NumRows == Lowered.getNumRows() &&
                             SI.NumColumns == Lowered.getNumColumns();
      if (SameShape)
        return Lowered;

      MatrixVal = Lowered.embedInVector(Builder);
    }

    // Split the flat vector into consecutive chunks of one stride each.
    const unsigned Stride = SI.getStride();
    SmallVector<Value *, 16> SplitVecs;
    for (unsigned Start = 0; Start < NumElts; Start += Stride)
      SplitVecs.push_back(Builder.CreateShuffleVector(
          MatrixVal, createSequentialMask(Start, Stride, 0), "split"));

    return {SplitVecs};
  }


  /// Record \p Shape for \p V and return true, unless \p V is an undef value,
  /// does not support shape information, or already has a shape; in those
  /// cases nothing is recorded and false is returned. With VerifyShapeInfo
  /// set, a conflicting existing shape aborts compilation.
  bool setShapeInfo(Value *V, ShapeInfo Shape) {
    assert(Shape && "Shape not set");
    if (isa<UndefValue>(V) || !supportsShapeInfo(V))
      return false;

    auto Existing = ShapeMap.find(V);
    if (Existing == ShapeMap.end()) {
      // First shape seen for V: remember it.
      ShapeMap.insert({V, Shape});
      LLVM_DEBUG(dbgs() << "  " << Shape.NumRows << " x " << Shape.NumColumns
                        << " for " << *V << "\n");
      return true;
    }

    const ShapeInfo &Known = Existing->second;
    const bool Conflicts = Known.NumRows != Shape.NumRows ||
                           Known.NumColumns != Shape.NumColumns;
    if (VerifyShapeInfo && Conflicts) {
      errs() << "Conflicting shapes (" << Known.NumRows << "x"
             << Known.NumColumns << " vs " << Shape.NumRows << "x"
             << Shape.NumColumns << ") for " << *V << "\n";
      report_fatal_error(
          "Matrix shape verification failed, compilation aborted!");
    }

    // The shape recorded first wins.
    LLVM_DEBUG(dbgs() << "  not overriding existing shape: " << Known.NumRows
                      << " " << Known.NumColumns << " for " << *V << "\n");
    return false;
  }


  /// Returns true if shape information can be used for \p V. The supported
  /// instructions must match the instructions that can be lowered by this
  /// pass: the four matrix intrinsics, loads, stores and uniform-shape
  /// arithmetic. Non-instructions and all other intrinsics are rejected.
  bool supportsShapeInfo(Value *V) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      return false;

    // For intrinsic calls only the matrix intrinsics qualify.
    if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
      const Intrinsic::ID ID = Intr->getIntrinsicID();
      return ID == Intrinsic::matrix_multiply ||
             ID == Intrinsic::matrix_transpose ||
             ID == Intrinsic::matrix_column_major_load ||
             ID == Intrinsic::matrix_column_major_store;
    }

    return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
  }


  /// Propagate the shape information of instructions to their users.
  /// \p WorkList contains instructions for which the shape can be computed,
  /// either from the information provided by matrix intrinsics or from known
  /// operand shapes. It is drained; the returned list holds every instruction
  /// that received a new shape.
  SmallVector<Instruction *, 32>
  propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) {
    SmallVector<Instruction *, 32> NewWorkList;
    LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n");
    while (!WorkList.empty()) {
      Instruction *Inst = WorkList.pop_back_val();

      // Skip instructions whose shape cannot be computed or was not newly
      // recorded.
      std::optional<ShapeInfo> Shape = computeShapeInfoForInst(Inst, ShapeMap);
      if (!Shape || !setShapeInfo(Inst, *Shape))
        continue;

      // A new shape was recorded: report the instruction and queue the users
      // that do not have a shape yet.
      NewWorkList.push_back(Inst);
      for (User *U : Inst->users())
        if (ShapeMap.find(U) == ShapeMap.end())
          WorkList.push_back(cast<Instruction>(U));
    }

    return NewWorkList;
  }


  /// Propagate the shape to operands of instructions with shape information.
  /// \p WorkList contains the instructions for which we already know the
  /// shape; it is drained by this function. Returns the users of all
  /// operands that received a new shape, to be used as seeds for the next
  /// round of forward propagation.
  SmallVector<Instruction *, 32>
  propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) {
    SmallVector<Instruction *, 32> NewWorkList;

    // Append V to the given work list if it is an instruction.
    auto pushInstruction = [](Value *V,
                              SmallVectorImpl<Instruction *> &WorkList) {
      Instruction *I = dyn_cast<Instruction>(V);
      if (I)
        WorkList.push_back(I);
    };
    // Pop an element with known shape.  Traverse the operands, if their shape
    // derives from the result shape and is unknown, add it and add them to the
    // worklist.
    LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n");
    while (!WorkList.empty()) {
      Value *V = WorkList.pop_back_val();

      // Everything at index >= BeforeProcessingV after this iteration was
      // pushed while processing V, i.e. it just received a new shape.
      size_t BeforeProcessingV = WorkList.size();
      if (!isa<Instruction>(V))
        continue;

      Value *MatrixA;
      Value *MatrixB;
      Value *M;
      Value *N;
      Value *K;
      if (match(V, m_Intrinsic<Intrinsic::matrix_multiply>(
                       m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
                       m_Value(N), m_Value(K)))) {
        // Left operand is M x N, right operand is N x K.
        if (setShapeInfo(MatrixA, {M, N}))
          pushInstruction(MatrixA, WorkList);

        if (setShapeInfo(MatrixB, {N, K}))
          pushInstruction(MatrixB, WorkList);

      } else if (match(V, m_Intrinsic<Intrinsic::matrix_transpose>(
                              m_Value(MatrixA), m_Value(M), m_Value(N)))) {
        // The intrinsic's dimension arguments are used unflipped as the shape
        // of the operand (M x N).
        if (setShapeInfo(MatrixA, {M, N}))
          pushInstruction(MatrixA, WorkList);
      } else if (match(V, m_Intrinsic<Intrinsic::matrix_column_major_store>(
                              m_Value(MatrixA), m_Value(), m_Value(), m_Value(),
                              m_Value(M), m_Value(N)))) {
        // The stored matrix has the shape given by the last two arguments.
        if (setShapeInfo(MatrixA, {M, N})) {
          pushInstruction(MatrixA, WorkList);
        }
      } else if (isa<LoadInst>(V) ||
                 match(V, m_Intrinsic<Intrinsic::matrix_column_major_load>())) {
        // Nothing to do, no matrix input.
      } else if (isa<StoreInst>(V)) {
        // Nothing to do.  We forward-propagated to this so we would just
        // backward propagate to an instruction with an already known shape.
      } else if (isUniformShape(V)) {
        // Propagate to all operands.
        // NOTE(review): operator[] would default-insert an empty shape if V
        // had no entry; this relies on V already being present in ShapeMap.
        ShapeInfo Shape = ShapeMap[V];
        for (Use &U : cast<Instruction>(V)->operands()) {
          if (setShapeInfo(U.get(), Shape))
            pushInstruction(U.get(), WorkList);
        }
      }
      // After we discovered new shape info for new instructions in the
      // worklist, we use their users as seeds for the next round of forward
      // propagation. V itself is excluded.
      for (size_t I = BeforeProcessingV; I != WorkList.size(); I++)
        for (User *U : WorkList[I]->users())
          if (isa<Instruction>(U) && V != U)
            NewWorkList.push_back(cast<Instruction>(U));
    }
    return NewWorkList;
  }


  /// (Op0 op Op1)^T -> Op0^T op Op1^T
  /// Emit transposes of \p Op0 (shape \p Shape0) and \p Op1 (shape
  /// \p Shape1), record their shapes, and hand the transposed values with
  /// their transposed shapes to \p Operation. Returns what \p Operation
  /// returns.
  Instruction *distributeTransposes(
      Value *Op0, ShapeInfo Shape0, Value *Op1, ShapeInfo Shape1,
      MatrixBuilder &Builder,
      function_ref<Instruction *(Value *, ShapeInfo, Value *, ShapeInfo)>
          Operation) {
    const ShapeInfo Shape0T = Shape0.t();
    const ShapeInfo Shape1T = Shape1.t();

    // This runs after shape propagation, so the new transposes need their
    // shapes registered explicitly in order to be lowered later.
    Value *Op0T = Builder.CreateMatrixTranspose(
        Op0, Shape0.NumRows, Shape0.NumColumns, Op0->getName() + "_t");
    setShapeInfo(Op0T, Shape0T);

    Value *Op1T = Builder.CreateMatrixTranspose(
        Op1, Shape1.NumRows, Shape1.NumColumns, Op1->getName() + "_t");
    setShapeInfo(Op1T, Shape1T);

    return Operation(Op0T, Shape0T, Op1T, Shape1T);
  }


  /// Drop the ShapeMap entry of \p Inst (if there is one) and then erase
  /// \p Inst from its parent.
  void eraseFromParentAndRemoveFromShapeMap(Instruction *Inst) {
    // Erasing by key is a no-op when Inst has no entry.
    ShapeMap.erase(Inst);
    Inst->eraseFromParent();
  }


  /// Erase the instruction \p V from \p BB unless it still has uses. If
  /// \p II currently points at \p V, it is advanced first so the iterator is
  /// not invalidated.
  void eraseFromParentAndMove(Value *V, BasicBlock::reverse_iterator &II,
                              BasicBlock &BB) {
    auto *Inst = cast<Instruction>(V);
    if (Inst->use_empty()) {
      const bool IteratorAtInst = II != BB.rend() && &*II == Inst;
      if (IteratorAtInst)
        ++II;
      eraseFromParentAndRemoveFromShapeMap(Inst);
    }
  }


  /// Replace all uses of \p Old with \p New and move \p Old's shape info (if
  /// any) over: the entry for \p Old is erased from ShapeMap, and an entry
  /// with the same shape is added for \p New if \p New supports shape
  /// information.
  void updateShapeAndReplaceAllUsesWith(Instruction &Old, Value *New) {
    // We need to remove Old from the ShapeMap otherwise RAUW will replace it
    // with New. We should only add New if it supportsShapeInfo so we insert
    // it conditionally instead.
    auto S = ShapeMap.find(&Old);
    if (S != ShapeMap.end()) {
      // Copy the shape out before erasing; the iterator must not be
      // dereferenced once its entry has been erased.
      const ShapeInfo OldShape = S->second;
      ShapeMap.erase(S);
      if (supportsShapeInfo(New))
        ShapeMap.insert({New, OldShape});
    }
    Old.replaceAllUsesWith(New);
  }


  /// Sink a top-level transpose inside matmuls and adds.
  /// This creates and erases instructions as needed, and returns the newly
  /// created instruction while updating the iterator to avoid invalidation. If
  /// this returns nullptr, no new instruction was created.
  ///
  /// \param I  Candidate instruction; anything other than a
  ///           llvm.matrix.transpose call with constant dimensions is ignored.
  /// \param II Reverse iterator of the caller's walk over I's parent block. It
  ///           is passed on to eraseFromParentAndMove whenever an instruction
  ///           is erased here.
  ///
  /// Handled patterns (checked in this order):
  ///   (A^t)^t   -> A
  ///   k^t       -> k            (k is a splat)
  ///   (A * B)^t -> B^t * A^t    (matrix multiply)
  ///   (A * k)^t -> A^t * k      (element-wise mul with a splat operand)
  ///   (A + B)^t -> A^t + B^t
  Instruction *sinkTranspose(Instruction &I, BasicBlock::reverse_iterator &II) {
    BasicBlock &BB = *I.getParent();
    // New instructions are inserted right before the transpose being sunk.
    IRBuilder<> IB(&I);
    MatrixBuilder Builder(IB);

    // TA is the operand of the transpose; TAMA/TAMB are the operands of TA
    // when TA itself is a multiply or an add.
    Value *TA, *TAMA, *TAMB;
    ConstantInt *R, *K, *C;
    if (!match(&I, m_Intrinsic<Intrinsic::matrix_transpose>(
                       m_Value(TA), m_ConstantInt(R), m_ConstantInt(C))))
      return nullptr;

    // Transpose of a transpose is a nop
    Value *TATA;
    if (match(TA, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(TATA)))) {
      updateShapeAndReplaceAllUsesWith(I, TATA);
      // Erase the outer transpose first, then the inner one it was using.
      eraseFromParentAndMove(&I, II, BB);
      eraseFromParentAndMove(TA, II, BB);
      return nullptr;
    }

    // k^T -> k
    if (isSplat(TA)) {
      updateShapeAndReplaceAllUsesWith(I, TA);
      eraseFromParentAndMove(&I, II, BB);
      return nullptr;
    }

    // (A * B)^t -> B^t * A^t
    // RxK KxC      CxK   KxR
    // Note: this match rebinds R and C (and binds K) to the multiply's
    // dimension operands.
    if (match(TA, m_Intrinsic<Intrinsic::matrix_multiply>(
                      m_Value(TAMA), m_Value(TAMB), m_ConstantInt(R),
                      m_ConstantInt(K), m_ConstantInt(C)))) {
      // The operands are passed in swapped order (B first, then A); the
      // callback builds the multiply from the values handed back.
      auto NewInst = distributeTransposes(
          TAMB, {K, C}, TAMA, {R, K}, Builder,
          [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) {
            return Builder.CreateMatrixMultiply(T0, T1, Shape0.NumRows,
                                                Shape0.NumColumns,
                                                Shape1.NumColumns, "mmul");
          });
      updateShapeAndReplaceAllUsesWith(I, NewInst);
      eraseFromParentAndMove(&I, II, BB);
      eraseFromParentAndMove(TA, II, BB);
      return NewInst;
    }

    // Same as above, but with a mul, which occurs when multiplied
    // with a scalar.
    // (A * k)^t -> A^t * k
    //  R  x  C     RxC
    if (match(TA, m_AnyMul(m_Value(TAMA), m_Value(TAMB))) &&
        (isSplat(TAMA) || isSplat(TAMB))) {
      IRBuilder<> LocalBuilder(&I);
      // We know that the transposed operand is of shape RxC.
      // And when multiplied with a scalar, the shape is preserved.
      auto NewInst = distributeTransposes(
          TAMA, {R, C}, TAMB, {R, C}, Builder,
          [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) {
            // Pick fmul vs. mul based on the type of the original transpose.
            bool IsFP = I.getType()->isFPOrFPVectorTy();
            auto *Mul = IsFP ? LocalBuilder.CreateFMul(T0, T1, "mmul")
                             : LocalBuilder.CreateMul(T0, T1, "mmul");
            auto *Result = cast<Instruction>(Mul);
            // The new element-wise op takes the shape of its first operand.
            setShapeInfo(Result, Shape0);
            return Result;
          });
      updateShapeAndReplaceAllUsesWith(I, NewInst);
      eraseFromParentAndMove(&I, II, BB);
      eraseFromParentAndMove(TA, II, BB);
      return NewInst;
    }

    // (A + B)^t -> A^t + B^t
    // RxC RxC      CxR   CxR
    if (match(TA, m_AnyAdd(m_Value(TAMA), m_Value(TAMB)))) {
      IRBuilder<> LocalBuilder(&I);
      auto NewInst = distributeTransposes(
          TAMA, {R, C}, TAMB, {R, C}, Builder,
          [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) {
            // Pick fadd vs. add based on the type of the original transpose.
            bool IsFP = I.getType()->isFPOrFPVectorTy();
            auto *Add = IsFP ? LocalBuilder.CreateFAdd(T0, T1, "madd")
                             : LocalBuilder.CreateAdd(T0, T1, "madd");

            auto *Result = cast<Instruction>(Add);
            setShapeInfo(Result, Shape0);
            return Result;
          });
      updateShapeAndReplaceAllUsesWith(I, NewInst);
      eraseFromParentAndMove(&I, II, BB);
      eraseFromParentAndMove(TA, II, BB);
      return NewInst;
    }

    // No pattern applied; leave the transpose in place.
    return nullptr;
  }


  /// Lift transposes out of \p I when both of its operands are transposes, so
  /// that a single transpose remains on the result:
  ///   A^t * B^t -> (B * A)^t   (matrix multiply)
  ///   A^t + B^t -> (A + B)^t   (floating-point add)
  /// \p I and its operand transposes are erased once they have no uses left.
  void liftTranspose(Instruction &I) {
    // Drops the replaced operation and whichever operand transposes ended up
    // without users.
    auto EraseDeadOperands = [this](Instruction &Replaced, Value *LHS,
                                    Value *RHS) {
      if (Replaced.use_empty())
        eraseFromParentAndRemoveFromShapeMap(&Replaced);
      if (LHS->use_empty())
        eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(LHS));
      if (LHS != RHS && RHS->use_empty())
        eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(RHS));
    };

    Value *A, *B, *AT, *BT;
    ConstantInt *R, *K, *C;

    // A^t * B ^t -> (B * A)^t
    const bool IsTransposedMatMul =
        match(&I, m_Intrinsic<Intrinsic::matrix_multiply>(
                      m_Value(A), m_Value(B), m_ConstantInt(R),
                      m_ConstantInt(K), m_ConstantInt(C))) &&
        match(A, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(AT))) &&
        match(B, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value((BT))));
    if (IsTransposedMatMul) {
      IRBuilder<> IB(&I);
      MatrixBuilder Builder(IB);
      // Multiply the un-transposed operands in swapped order ...
      Value *Product = Builder.CreateMatrixMultiply(
          BT, AT, C->getZExtValue(), K->getZExtValue(), R->getZExtValue());
      setShapeInfo(Product, {C, R});
      // ... and transpose the product once.
      Instruction *Lifted = Builder.CreateMatrixTranspose(
          Product, C->getZExtValue(), R->getZExtValue());
      updateShapeAndReplaceAllUsesWith(I, Lifted);
      EraseDeadOperands(I, A, B);
      return;
    }

    // A^t + B ^t -> (A + B)^t. Pick rows and columns from first transpose. If
    // the shape of the second transpose is different, there's a shape conflict
    // which gets resolved by picking the shape of the first operand.
    const bool IsTransposedFAdd =
        match(&I, m_FAdd(m_Value(A), m_Value(B))) &&
        match(A, m_Intrinsic<Intrinsic::matrix_transpose>(
                     m_Value(AT), m_ConstantInt(R), m_ConstantInt(C))) &&
        match(B, m_Intrinsic<Intrinsic::matrix_transpose>(
                     m_Value(BT), m_ConstantInt(), m_ConstantInt()));
    if (!IsTransposedFAdd)
      return;

    IRBuilder<> Builder(&I);
    auto *Sum = Builder.CreateFAdd(AT, BT, "mfadd");
    MatrixBuilder MBuilder(Builder);
    Instruction *Lifted = MBuilder.CreateMatrixTranspose(
        Sum, R->getZExtValue(), C->getZExtValue(), "mfadd_t");
    updateShapeAndReplaceAllUsesWith(I, Lifted);
    assert(computeShapeInfoForInst(Lifted, ShapeMap) ==
               computeShapeInfoForInst(&I, ShapeMap) &&
           "Shape of new instruction doesn't match original shape.");
    EraseDeadOperands(I, A, B);

    // Record the shape of the new addition (it may have been folded to a
    // non-instruction, in which case there is nothing to record).
    if (auto *SumInst = dyn_cast<Instruction>(Sum)) {
      setShapeInfo(SumInst, {R, C});
      assert(
          computeShapeInfoForInst(SumInst, ShapeMap).value_or(ShapeMap[SumInst]) ==
              ShapeMap[SumInst] &&
          "Shape of updated addition doesn't match cached shape.");
    }
  }


  /// Try moving transposes in order to fold them away or into multiplies.
  ///
  /// Runs two sweeps over the function: a bottom-up sweep that sinks
  /// transposes (sinkTranspose) followed by a top-down sweep that lifts them
  /// again where both operands are transposed (liftTranspose).
  void optimizeTransposes() {
    // First sink all transposes inside matmuls and adds, hoping that we end up
    // with NN, NT or TN variants.
    for (BasicBlock &BB : reverse(Func)) {
      auto II = BB.rbegin();
      while (II != BB.rend()) {
        // Step past the current instruction before handing it over, because
        // sinkTranspose may erase it.
        Instruction &Cur = *II;
        ++II;
        Instruction *Created = sinkTranspose(Cur, II);
        if (!Created)
          continue;
        // Resume the walk relative to the freshly created instruction.
        II = std::next(BasicBlock::reverse_iterator(Created));
      }
    }

    // If we have a TT matmul or a TT add, lift the transpose. We may be able
    // to fold into consuming multiply or add.
    for (BasicBlock &BB : Func)
      for (Instruction &Cur : llvm::make_early_inc_range(BB))
        liftTranspose(Cur);
  }


  /// Lower all matrix operations in Func.
  ///
  /// Steps, in order: seed a worklist with the matrix intrinsics, propagate
  /// shape information forward and backward until the worklist is empty,
  /// optimize transposes (unless isMinimal()), lower dot products, try the
  /// fused multiply lowering, lower every remaining instruction that has shape
  /// information, emit remarks, and finally erase everything queued in
  /// ToRemove.
  ///
  /// \returns false right away if the function contains no matrix intrinsics;
  /// otherwise true iff something was fused or one of the Visit* helpers
  /// reported a change.
  bool Visit() {
    SmallVector<Instruction *, 32> WorkList;

    // Initially only the shape of matrix intrinsics is known.
    // Initialize the work list with ops carrying shape information.
    for (BasicBlock &BB : Func)
      for (Instruction &Inst : BB) {
        IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst);
        if (!II)
          continue;

        switch (II->getIntrinsicID()) {
        case Intrinsic::matrix_multiply:
        case Intrinsic::matrix_transpose:
        case Intrinsic::matrix_column_major_load:
        case Intrinsic::matrix_column_major_store:
          WorkList.push_back(&Inst);
          break;
        default:
          break;
        }
      }

    // Avoid unnecessary work if there are no matrix intrinsics in the function.
    if (WorkList.empty())
      return false;

    // Analyses are only fetched when an analysis manager is available, and
    // only after we know there is work to do.
    if (AM) {
      ORE = &AM->getResult<OptimizationRemarkEmitterAnalysis>(Func);
      AA = &AM->getResult<AAManager>(Func);
      DT = &AM->getResult<DominatorTreeAnalysis>(Func);
      LI = &AM->getResult<LoopAnalysis>(Func);
    }

    // Propagate shapes until nothing changes any longer.
    while (!WorkList.empty()) {
      WorkList = propagateShapeForward(WorkList);
      WorkList = propagateShapeBackward(WorkList);
    }

    if (!isMinimal()) {
      optimizeTransposes();
      if (PrintAfterTransposeOpt) {
        dbgs() << "Dump after matrix transpose optimization:\n";
        Func.print(dbgs());
      }
    }

    bool Changed = false;
    SmallVector<CallInst *, 16> MaybeFusableInsts;
    SmallVector<Instruction *, 16> MatrixInsts;
    SmallVector<IntrinsicInst *, 16> LifetimeEnds;

    // First, collect all instructions with shape information and candidates for
    // fusion (currently only matrix multiplies).
    ReversePostOrderTraversal<Function *> RPOT(&Func);
    for (auto *BB : RPOT)
      for (Instruction &I : *BB) {
        // lifetime.end markers are collected regardless of shape info; they
        // are handed to the fused multiply lowering below.
        if (match(&I, m_Intrinsic<Intrinsic::lifetime_end>()))
          LifetimeEnds.push_back(cast<IntrinsicInst>(&I));
        if (ShapeMap.find(&I) == ShapeMap.end())
          continue;
        if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))
          MaybeFusableInsts.push_back(cast<CallInst>(&I));
        MatrixInsts.push_back(&I);
      }

    // Second, try to lower any dot products
    SmallPtrSet<Instruction *, 16> FusedInsts;
    for (CallInst *CI : MaybeFusableInsts)
      lowerDotProduct(CI, FusedInsts, getFastMathFlags(CI));

    // Third, try to fuse candidates.
    for (CallInst *CI : MaybeFusableInsts)
      if (!FusedInsts.contains(CI))
        LowerMatrixMultiplyFused(CI, FusedInsts, LifetimeEnds);

    Changed = !FusedInsts.empty();

    // Fourth, lower remaining instructions with shape information.
    for (Instruction *Inst : MatrixInsts) {
      // Already handled by the dot-product or fused lowering.
      if (FusedInsts.count(Inst))
        continue;

      IRBuilder<> Builder(Inst);

      if (CallInst *CInst = dyn_cast<CallInst>(Inst))
        Changed |= VisitCallInst(CInst);

      Value *Op1;
      Value *Op2;
      if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
        Changed |= VisitBinaryOperator(BinOp);
      if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
        Changed |= VisitUnaryOperator(UnOp);
      if (match(Inst, m_Load(m_Value(Op1))))
        Changed |= VisitLoad(cast<LoadInst>(Inst), Op1, Builder);
      else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
        Changed |= VisitStore(cast<StoreInst>(Inst), Op1, Op2, Builder);
    }

    // Remarks are only emitted when an OptimizationRemarkEmitter is set.
    if (ORE) {
      RemarkGenerator RemarkGen(Inst2ColumnMatrix, *ORE, Func);
      RemarkGen.emitRemarks();
    }

    // Delete the instructions backwards, as it has a reduced likelihood of
    // having to update as many def-use and use-def chains.
    //
    // Because we add to ToRemove during fusion we can't guarantee that defs
    // are before uses.  Change uses to poison temporarily as these should get
    // removed as well.
    //
    // For verification, we keep track of where we changed uses to poison in
    // PoisonedInsts and then check that we in fact remove them.
    SmallSet<Instruction *, 16> PoisonedInsts;
    for (auto *Inst : reverse(ToRemove)) {
      for (Use &U : llvm::make_early_inc_range(Inst->uses())) {
        if (auto *Poisoned = dyn_cast<Instruction>(U.getUser()))
          PoisonedInsts.insert(Poisoned);
        U.set(PoisonValue::get(Inst->getType()));
      }
      Inst->eraseFromParent();
      // Inst itself may have been recorded as a poisoned user earlier; it is
      // gone now, so it no longer needs to be tracked.
      PoisonedInsts.erase(Inst);
    }
    if (!PoisonedInsts.empty()) {
      // If we didn't remove all poisoned instructions, it's a hard error.
      dbgs() << "Poisoned but present instructions:\n";
      for (auto *I : PoisonedInsts)
        dbgs() << *I << "\n";
      llvm_unreachable("Poisoned but instruction not removed");
    }

    return Changed;
  }


  /// Lower \p Inst if it is a direct call to one of the matrix intrinsics
  /// (multiply, transpose, column-major load, column-major store).
  ///
  /// \returns true if the call was dispatched to a lowering routine, false if
  /// it is not an intrinsic call or is an intrinsic this pass does not lower.
  bool VisitCallInst(CallInst *Inst) {
    Function *Callee = Inst->getCalledFunction();
    if (!Callee || !Callee->isIntrinsic())
      return false;

    switch (Callee->getIntrinsicID()) {
    case Intrinsic::matrix_multiply:
      LowerMultiply(Inst);
      return true;
    case Intrinsic::matrix_transpose:
      LowerTranspose(Inst);
      return true;
    case Intrinsic::matrix_column_major_load:
      LowerColumnMajorLoad(Inst);
      return true;
    case Intrinsic::matrix_column_major_store:
      LowerColumnMajorStore(Inst);
      return true;
    default:
      return false;
    }
  }


  /// Compute the alignment usable for the vector (column or row) at index
  /// \p Idx, given that consecutive vectors are \p Stride elements apart.
  ///
  /// \p A is the alignment of the address at \p Idx == 0; if it is not set the
  /// ABI alignment of \p ElementTy is used. For a ConstantInt \p Stride the
  /// result is the common alignment of that base alignment and the byte offset
  /// of vector \p Idx. For any other stride it is the common alignment of the
  /// base alignment and the element size in bytes.
  Align getAlignForIndex(unsigned Idx, Value *Stride, Type *ElementTy,
                         MaybeAlign A) const {
    const Align BaseAlign = DL.getValueOrABITypeAlignment(A, ElementTy);
    // The first vector sits at the base address itself.
    if (Idx == 0)
      return BaseAlign;

    TypeSize EltBits = DL.getTypeSizeInBits(ElementTy);
    auto *ConstStride = dyn_cast<ConstantInt>(Stride);
    // Unknown stride: all we know is that the offset is a multiple of the
    // element size.
    if (!ConstStride)
      return commonAlignment(BaseAlign, EltBits / 8);

    uint64_t StrideInBytes = ConstStride->getZExtValue() * EltBits / 8;
    return commonAlignment(BaseAlign, Idx * StrideInBytes);
  }


  /// Load a matrix of shape \p Shape from memory starting at \p Ptr, with
  /// \p Stride elements between the starts of consecutive vectors.
  ///
  /// One aligned vector load named "col.load" is emitted per vector of the
  /// matrix; the element type is taken from the vector type \p Ty. The
  /// returned MatrixTy has its load counter increased accordingly.
  MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
                      bool IsVolatile, ShapeInfo Shape, IRBuilder<> &Builder) {
    Type *EltTy = cast<VectorType>(Ty)->getElementType();
    Type *VecTy = FixedVectorType::get(EltTy, Shape.getStride());
    // Vector indices are materialized with the same bit width as the stride.
    const unsigned IdxBits = Stride->getType()->getScalarSizeInBits();
    const unsigned NumVectors = Shape.getNumVectors();

    MatrixTy Result;
    for (unsigned VecIdx = 0; VecIdx != NumVectors; ++VecIdx) {
      Value *VecAddr =
          computeVectorAddr(Ptr, Builder.getIntN(IdxBits, VecIdx), Stride,
                            Shape.getStride(), EltTy, Builder);
      const Align VecAlign = getAlignForIndex(VecIdx, Stride, EltTy, MAlign);
      Value *Vector = Builder.CreateAlignedLoad(VecTy, VecAddr, VecAlign,
                                                IsVolatile, "col.load");
      Result.addVector(Vector);
    }
    return Result.addNumLoads(getNumOps(Result.getVectorTy()) *
                              Result.getNumVectors());
  }


  /// Load a tile of shape \p ResultShape out of a larger matrix of shape
  /// \p MatrixShape stored at \p MatrixPtr. The tile starts at element offset
  /// \p J * MatrixShape.getStride() + \p I, and the enclosing matrix's stride
  /// is used as the stride between the tile's vectors.
  MatrixTy loadMatrix(Value *MatrixPtr, MaybeAlign Align, bool IsVolatile,
                      ShapeInfo MatrixShape, Value *I, Value *J,
                      ShapeInfo ResultShape, Type *EltTy,
                      IRBuilder<> &Builder) {
    Value *MatrixStride = Builder.getInt64(MatrixShape.getStride());

    // Element offset of the tile's first element within the full matrix.
    Value *VectorOffset = Builder.CreateMul(J, MatrixStride);
    Value *Offset = Builder.CreateAdd(VectorOffset, I);
    Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);

    const auto NumTileElts = ResultShape.NumRows * ResultShape.NumColumns;
    auto *TileTy = FixedVectorType::get(EltTy, NumTileElts);

    return loadMatrix(TileTy, TileStart, Align, MatrixStride, IsVolatile,
                      ResultShape, Builder);
  }


  /// Lower the load-like instruction \p Inst, which has shape \p Shape, into
  /// per-vector loads from \p Ptr and register the result as its lowering.
  void LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align, Value *Stride,
                 bool IsVolatile, ShapeInfo Shape) {
    // Emit the replacement code right before the instruction being lowered.
    IRBuilder<> Builder(Inst);
    MatrixTy Loaded = loadMatrix(Inst->getType(), Ptr, Align, Stride,
                                 IsVolatile, Shape, Builder);
    finalizeLowering(Inst, Loaded, Builder);
  }


  /// Lowers llvm.matrix.column.major.load.
  ///
  /// The intrinsic loads a matrix from memory using a stride between columns.
  /// Operands as read here: 0 = pointer, 1 = stride, 2 = volatile flag,
  /// 3 and 4 = the two shape dimensions.
  void LowerColumnMajorLoad(CallInst *Inst) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Intrinsic only supports column-major layout!");

    Value *Ptr = Inst->getArgOperand(0);
    Value *Stride = Inst->getArgOperand(1);
    const bool IsVolatile = cast<ConstantInt>(Inst->getArgOperand(2))->isOne();
    ShapeInfo Shape(Inst->getArgOperand(3), Inst->getArgOperand(4));

    LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride, IsVolatile, Shape);
  }


  /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p

  /// MatrixPtr[I][J].

  void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr,

                   MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,

                   Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {

    Value *Offset = Builder.CreateAdd(

        Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);


    Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);

    auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *

                                                   StoreVal.getNumColumns());


    storeMatrix(TileTy, StoreVal, TileStart, MAlign,

                Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder);

  }


  /// Store the vectors of \p StoreVal to memory starting at \p Ptr, with
  /// \p Stride elements between the starts of consecutive vectors.
  ///
  /// One aligned store is emitted per vector; the element type comes from the
  /// vector type \p Ty. The returned MatrixTy holds no vectors and only
  /// carries the number of stores that were emitted.
  MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr,
                       MaybeAlign MAlign, Value *Stride, bool IsVolatile,
                       IRBuilder<> &Builder) {
    Type *EltTy = cast<VectorType>(Ty)->getElementType();
    // Vector indices are materialized with the same bit width as the stride.
    const unsigned IdxBits = Stride->getType()->getScalarSizeInBits();

    for (auto Entry : enumerate(StoreVal.vectors())) {
      Value *VecAddr = computeVectorAddr(
          Ptr, Builder.getIntN(IdxBits, Entry.index()), Stride,
          StoreVal.getStride(), EltTy, Builder);
      const Align VecAlign =
          getAlignForIndex(Entry.index(), Stride, EltTy, MAlign);
      Builder.CreateAlignedStore(Entry.value(), VecAddr, VecAlign, IsVolatile);
    }

    return MatrixTy().addNumStores(getNumOps(StoreVal.getVectorTy()) *
                                   StoreVal.getNumVectors());
  }


  /// Lower the store-like instruction \p Inst: fetch \p Matrix as a MatrixTy
  /// of shape \p Shape, store its vectors to \p Ptr and register the result as
  /// the lowering of \p Inst.
  void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, MaybeAlign A,
                  Value *Stride, bool IsVolatile, ShapeInfo Shape) {
    // Emit the replacement code right before the instruction being lowered.
    IRBuilder<> Builder(Inst);
    auto StoreVal = getMatrix(Matrix, Shape, Builder);
    MatrixTy Stored = storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride,
                                  IsVolatile, Builder);
    finalizeLowering(Inst, Stored, Builder);
  }


  /// Lowers llvm.matrix.column.major.store.
  ///
  /// The intrinsic stores a matrix back to memory using a stride between
  /// columns. Operands as read here: 0 = matrix value, 1 = pointer,
  /// 2 = stride, 3 = volatile flag, 4 and 5 = the two shape dimensions.
  void LowerColumnMajorStore(CallInst *Inst) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Intrinsic only supports column-major layout!");

    Value *Matrix = Inst->getArgOperand(0);
    Value *Ptr = Inst->getArgOperand(1);
    Value *Stride = Inst->getArgOperand(2);
    const bool IsVolatile = cast<ConstantInt>(Inst->getArgOperand(3))->isOne();
    ShapeInfo Shape(Inst->getArgOperand(4), Inst->getArgOperand(5));

    LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride, IsVolatile,
               Shape);
  }


  /// Return a copy of vector \p Col in which the elements starting at index
  /// \p I have been replaced by the elements of \p Block. Implemented with two
  /// shuffles: one widening \p Block to the length of \p Col, one blending.
  Value *insertVector(Value *Col, unsigned I, Value *Block,
                      IRBuilder<> &Builder) {
    const unsigned BlockNumElts =
        cast<FixedVectorType>(Block->getType())->getNumElements();
    const unsigned NumElts =
        cast<FixedVectorType>(Col->getType())->getNumElements();
    assert(NumElts >= BlockNumElts && "Too few elements for current block");

    // First, bring Block to the same size as Col
    Block = Builder.CreateShuffleVector(
        Block, createSequentialMask(0, BlockNumElts, NumElts - BlockNumElts));

    // Build the blend mask. Indices below NumElts select from Col, indices
    // from NumElts upwards select from the widened Block.
    // If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7,
    // 8, 4, 5, 6
    SmallVector<int, 16> Mask;
    unsigned Lane = 0;

    // Lanes in front of the insertion point keep Col's elements.
    while (Lane < I) {
      Mask.push_back(Lane);
      ++Lane;
    }

    // Lanes covered by the block take Block's elements.
    const unsigned BlockEnd = I + BlockNumElts;
    while (Lane < BlockEnd) {
      Mask.push_back(Lane - I + NumElts);
      ++Lane;
    }

    // Remaining lanes keep Col's elements.
    while (Lane < NumElts) {
      Mask.push_back(Lane);
      ++Lane;
    }

    return Builder.CreateShuffleVector(Col, Block, Mask);
  }


  /// Emit \p Sum + \p A * \p B, or just \p A * \p B when \p Sum is null.
  ///
  /// \p UseFPOp selects floating-point over integer instructions. For
  /// floating point with \p AllowContraction, a single llvm.fmuladd call is
  /// emitted instead of a separate multiply and add. \p NumComputeOps is
  /// increased by getNumOps(A->getType()) for each operation that is counted:
  /// once for a lone multiply or an fmuladd, twice for a multiply plus add.
  Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
                      IRBuilder<> &Builder, bool AllowContraction,
                      unsigned &NumComputeOps) {
    const unsigned OpsPerInst = getNumOps(A->getType());
    NumComputeOps += OpsPerInst;

    // Nothing to accumulate into yet: the product is the result.
    if (!Sum) {
      if (UseFPOp)
        return Builder.CreateFMul(A, B);
      return Builder.CreateMul(A, B);
    }

    // Use fmuladd for floating point operations and let the backend decide
    // if that's profitable.
    if (UseFPOp && AllowContraction)
      return Builder.CreateIntrinsic(Intrinsic::fmuladd, A->getType(),
                                     {A, B, Sum});

    // Separate multiply and add: account for the second operation.
    NumComputeOps += OpsPerInst;
    if (UseFPOp) {
      Value *Product = Builder.CreateFMul(A, B);
      return Builder.CreateFAdd(Sum, Product);
    }
    Value *Product = Builder.CreateMul(A, B);
    return Builder.CreateAdd(Sum, Product);
  }


  /// Record \p Matrix as the lowered form of \p Inst, queue \p Inst for
  /// removal and patch up its users.
  ///
  /// Users that have an entry in ShapeMap are left untouched (they pick up the
  /// cached value when they are lowered themselves). Every other use is
  /// redirected to a flattened vector built from \p Matrix; that vector is
  /// created lazily, at most once, on the first such use.
  void finalizeLowering(Instruction *Inst, MatrixTy Matrix,
                        IRBuilder<> &Builder) {
    [[maybe_unused]] const bool IsFirstLowering =
        Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix)).second;
    assert(IsFirstLowering && "multiple matrix lowering mapping");

    ToRemove.push_back(Inst);

    Value *Flattened = nullptr;
    for (Use &U : llvm::make_early_inc_range(Inst->uses())) {
      // Shape-aware users need no rewriting.
      if (ShapeMap.find(U.getUser()) != ShapeMap.end())
        continue;
      if (!Flattened)
        Flattened = Matrix.embedInVector(Builder);
      U.set(Flattened);
    }
  }


  /// Special case for MatMul lowering. Prevents scalar loads of row-major
  /// vectors. Lowers to a vector reduction add instead of sequential adds if
  /// reassociation is enabled.
  ///
  /// Only applies when the layout is column-major and \p MatMul is a dot
  /// product, i.e. a (1 x N) * (N x 1) multiply. On success \p MatMul is added
  /// to \p FusedInsts and queued in ToRemove, and its uses are replaced with a
  /// vector holding the reduced scalar in element 0. \p FMF supplies the
  /// fast-math flags; floating-point inputs are only handled when it allows
  /// reassociation.
  void lowerDotProduct(CallInst *MatMul,
                       SmallPtrSet<Instruction *, 16> &FusedInsts,
                       FastMathFlags FMF) {
    if (FusedInsts.contains(MatMul) ||
        MatrixLayout != MatrixLayoutTy::ColumnMajor)
      return;
    // Operands 2, 3 and 4 of the multiply intrinsic are its three dimensions.
    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));

    if (LShape.NumRows != 1 || RShape.NumColumns != 1) // not a dot product
      return;

    Value *LHS = MatMul->getArgOperand(0);
    Value *RHS = MatMul->getArgOperand(1);

    Type *ElementType = cast<VectorType>(LHS->getType())->getElementType();
    bool IsIntVec = ElementType->isIntegerTy();

    // Floating point reductions require reassociation.
    if (!IsIntVec && !FMF.allowReassoc())
      return;

    // An operand can be flattened if it is any binary operator, or a
    // single-use load, matrix transpose, or column-major load whose second
    // argument (the stride) is the constant 1.
    auto CanBeFlattened = [](Value *Op) {
      if (match(Op, m_BinOp()))
        return true;
      return match(
          Op, m_OneUse(m_CombineOr(
                  m_Load(m_Value()),
                  m_CombineOr(m_Intrinsic<Intrinsic::matrix_transpose>(),
                              m_Intrinsic<Intrinsic::matrix_column_major_load>(
                                  m_Value(), m_SpecificInt(1))))));
    };
    // Returns the cost benefit of using \p Op with the dot product lowering. If
    // the returned cost is < 0, the argument is cheaper to use in the
    // dot-product lowering.
    auto GetCostForArg = [this, &CanBeFlattened](Value *Op, unsigned N) {
      // Values without shape information get an invalid cost.
      if (ShapeMap.find(Op) == ShapeMap.end())
        return InstructionCost::getInvalid();

      if (!isa<Instruction>(Op))
        return InstructionCost(0);

      FixedVectorType *VecTy = cast<FixedVectorType>(Op->getType());
      Type *EltTy = VecTy->getElementType();

      if (!CanBeFlattened(Op)) {
        InstructionCost EmbedCost(0);
        // Roughly estimate the cost for embedding the columns into a vector.
        for (unsigned I = 1; I < N; ++I)
          EmbedCost +=
              TTI.getShuffleCost(TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
                                 {}, TTI::TCK_RecipThroughput);
        return EmbedCost;
      }

      // Binary operator: one vector op versus N scalar ops.
      if (match(Op, m_BinOp()) && ShapeMap.find(Op) != ShapeMap.end()) {
        InstructionCost OriginalCost =
            TTI.getArithmeticInstrCost(cast<Instruction>(Op)->getOpcode(),
                                       EltTy) *
            N;
        InstructionCost NewCost = TTI.getArithmeticInstrCost(
            cast<Instruction>(Op)->getOpcode(), VecTy);
        return NewCost - OriginalCost;
      }

      if (match(Op, m_Intrinsic<Intrinsic::matrix_transpose>())) {
        // The transpose can be skipped for the dot product lowering, roughly
        // estimate the savings as the cost of embedding the columns in a
        // vector.
        InstructionCost EmbedCost(0);
        for (unsigned I = 1; I < N; ++I)
          EmbedCost -=
              TTI.getShuffleCost(TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
                                 {}, TTI::TCK_RecipThroughput);
        return EmbedCost;
      }

      // Costs for loads.
      if (N == 1)
        return InstructionCost(0);

      // One vector load versus N scalar loads.
      return TTI.getMemoryOpCost(Instruction::Load, VecTy, Align(1), 0) -
             N * TTI.getMemoryOpCost(Instruction::Load, EltTy, Align(1), 0);
    };

    // Iterate over LHS and operations feeding LHS and check if it is profitable
    // to flatten the visited ops.  For each op, we compute the difference
    // between the flattened and matrix versions.
    SmallPtrSet<Value *, 4> Seen;
    SmallVector<Value *> WorkList;
    SmallVector<Value *> ToFlatten;
    WorkList.push_back(LHS);
    InstructionCost LHSCost(0);
    while (!WorkList.empty()) {
      Value *Op = WorkList.pop_back_val();
      // Visit every value at most once.
      if (!Seen.insert(Op).second)
        continue;

      InstructionCost OpCost = GetCostForArg(Op, LShape.NumColumns);
      // Only ops that lower the accumulated cost are flattened; anything else
      // is skipped and its operands are not explored.
      // NOTE(review): this comparison presumably also rejects the invalid
      // cost returned for values without shape info - confirm against
      // InstructionCost's comparison semantics.
      if (OpCost + LHSCost >= LHSCost)
        continue;

      LHSCost += OpCost;
      ToFlatten.push_back(Op);
      if (auto *I = dyn_cast<Instruction>(Op))
        WorkList.append(I->op_begin(), I->op_end());
    }

    // We compare the costs of a vector.reduce.add to sequential add.
    int AddOpCode = IsIntVec ? Instruction::Add : Instruction::FAdd;
    int MulOpCode = IsIntVec ? Instruction::Mul : Instruction::FMul;
    InstructionCost ReductionCost =
        TTI.getArithmeticReductionCost(
            AddOpCode, cast<VectorType>(LHS->getType()),
            IsIntVec ? std::nullopt : std::optional(FMF)) +
        TTI.getArithmeticInstrCost(MulOpCode, LHS->getType());
    InstructionCost SequentialAddCost =
        TTI.getArithmeticInstrCost(AddOpCode, ElementType) *
            (LShape.NumColumns - 1) +
        TTI.getArithmeticInstrCost(MulOpCode, ElementType) *
            (LShape.NumColumns);
    // Bail out if the reduction-based lowering is estimated to cost more.
    if ((LHSCost + ReductionCost - SequentialAddCost) > InstructionCost(0))
      return;

    FusedInsts.insert(MatMul);
    // New instructions are inserted right before the multiply.
    IRBuilder<> Builder(MatMul);
    auto FlattenArg = [&Builder, &FusedInsts, &CanBeFlattened,
                       this](Value *Op) {
      // Matmul must be the only user of loads because we don't use LowerLoad
      // for row vectors (LowerLoad results in scalar loads and shufflevectors
      // instead of single vector load).
      if (!CanBeFlattened(Op))
        return;

      // Binary operators with shape info stay in place; only their cached
      // shape is replaced by the result of ShapeInfo::t().
      if (match(Op, m_BinOp())) {
        auto It = ShapeMap.find(Op);
        if (It != ShapeMap.end()) {
          It->second = It->second.t();
          return;
        }
      }

      FusedInsts.insert(cast<Instruction>(Op));
      // If vector uses the builtin load, lower to a LoadInst
      Value *Arg;
      if (match(Op, m_Intrinsic<Intrinsic::matrix_column_major_load>(
                        m_Value(Arg)))) {
        auto *NewLoad = Builder.CreateLoad(Op->getType(), Arg);
        Op->replaceAllUsesWith(NewLoad);
        eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(Op));
        return;
      } else if (match(Op, m_Intrinsic<Intrinsic::matrix_transpose>(
                               m_Value(Arg)))) {
        // Skip the transpose: users read its operand directly, and the
        // transpose itself is queued for deletion.
        ToRemove.push_back(cast<Instruction>(Op));
        Op->replaceAllUsesWith(Arg);
        return;
      }
    };

    for (auto *V : ToFlatten)
      FlattenArg(V);

    // Re-read the operand: flattening may have replaced the original LHS.
    LHS = MatMul->getArgOperand(0);

    // Insert mul/fmul followed by an add/fadd vector reduction.
    Value *Mul =
        IsIntVec ? Builder.CreateMul(LHS, RHS) : Builder.CreateFMul(LHS, RHS);

    Value *Result;
    if (IsIntVec)
      Result = Builder.CreateAddReduce(Mul);
    else {
      // The floating-point reduction starts from 0.0 and carries FMF.
      Result = Builder.CreateFAddReduce(
          ConstantFP::get(cast<VectorType>(LHS->getType())->getElementType(),
                          0.0),
          Mul);
      cast<Instruction>(Result)->setFastMathFlags(FMF);
    }

    // pack scalar back into a matrix and then replace matmul inst
    Result = Builder.CreateInsertElement(PoisonValue::get(MatMul->getType()),
                                         Result, uint64_t(0));
    MatMul->replaceAllUsesWith(Result);
    FusedInsts.insert(MatMul);
    ToRemove.push_back(MatMul);
  }


  /// Compute \p Result += \p A * \p B for input matrices with left-associating
  /// addition.
  ///
  /// The product is built block by block: a block of a vector from one operand
  /// is multiplied with a splatted scalar extracted from the other operand and
  /// accumulated along the shared dimension. The block size starts at a
  /// vectorization factor derived from the target's fixed-width vector
  /// register size and is halved as needed to cover the remainder.
  ///
  /// We can fold a transpose into the operand that is used to extract scalars.
  /// This is the first operand with row-major and the second with
  /// column-major.  If \p IsScalarMatrixTransposed we assume the appropriate
  /// operand is transposed.
  ///
  /// In the column-major path, \p IsTiled makes the accumulation start from
  /// the current contents of \p Result. The number of compute operations
  /// emitted is added to \p Result's statistics.
  void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
                          const MatrixTy &B, IRBuilder<> &Builder, bool IsTiled,
                          bool IsScalarMatrixTransposed, FastMathFlags FMF) {
    // Vectorization factor: elements per fixed-width register, at least 1.
    const unsigned VF = std::max<unsigned>(
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
                .getFixedValue() /
            Result.getElementType()->getPrimitiveSizeInBits().getFixedValue(),
        1U);
    const unsigned NumRows = Result.getNumRows();
    const unsigned NumCols = Result.getNumColumns();
    const unsigned Depth = A.getNumColumns();

    const bool IsFP = Result.getElementType()->isFloatingPointTy();
    assert(A.isColumnMajor() == B.isColumnMajor() &&
           Result.isColumnMajor() == A.isColumnMajor() &&
           "operands must agree on matrix layout");
    unsigned NumComputeOps = 0;

    Builder.setFastMathFlags(FMF);
    const bool AllowContract = FMF.allowContract();

    if (A.isColumnMajor()) {
      // Multiply columns from the first operand with scalars from the second
      // operand. Then move along the K axes and accumulate the columns.  With
      // this the adds can be vectorized without reassociation.
      for (unsigned Col = 0; Col < NumCols; ++Col) {
        unsigned BlockSize = VF;
        // If Result is zero, we don't need to accumulate in the K==0 iteration.
        const bool StartsAtZero =
            isa<ConstantAggregateZero>(Result.getColumn(Col));

        for (unsigned Row = 0; Row < NumRows; Row += BlockSize) {
          // Gradually lower the vectorization factor to cover the remainder.
          while (Row + BlockSize > NumRows)
            BlockSize /= 2;

          Value *Sum = nullptr;
          if (IsTiled)
            Sum = Result.extractVector(Row, Col, BlockSize, Builder);

          for (unsigned K = 0; K < Depth; ++K) {
            Value *LHSBlock = A.extractVector(Row, K, BlockSize, Builder);
            // Pick the scalar B[K][Col], swapping the roles of the indices if
            // B is to be read as transposed.
            const unsigned ScalarVec = IsScalarMatrixTransposed ? K : Col;
            const unsigned ScalarIdx = IsScalarMatrixTransposed ? Col : K;
            Value *Scalar = Builder.CreateExtractElement(
                B.getColumn(ScalarVec), ScalarIdx);
            Value *Splat =
                Builder.CreateVectorSplat(BlockSize, Scalar, "splat");
            Value *Acc = (StartsAtZero && K == 0) ? nullptr : Sum;
            Sum = createMulAdd(Acc, LHSBlock, Splat, IsFP, Builder,
                               AllowContract, NumComputeOps);
          }
          Result.setVector(
              Col, insertVector(Result.getVector(Col), Row, Sum, Builder));
        }
      }
    } else {
      // Multiply rows from the second operand with scalars from the first
      // operand. Then move along the K axes and accumulate the rows.  With this
      // the adds can be vectorized without reassociation.
      for (unsigned Row = 0; Row < NumRows; ++Row) {
        unsigned BlockSize = VF;
        const bool StartsAtZero =
            isa<ConstantAggregateZero>(Result.getRow(Row));

        for (unsigned Col = 0; Col < NumCols; Col += BlockSize) {
          // Gradually lower the vectorization factor to cover the remainder.
          while (Col + BlockSize > NumCols)
            BlockSize /= 2;

          Value *Sum = nullptr;
          for (unsigned K = 0; K < Depth; ++K) {
            Value *RHSBlock = B.extractVector(K, Col, BlockSize, Builder);
            // Pick the scalar A[Row][K], swapping the roles of the indices if
            // A is to be read as transposed.
            const unsigned ScalarVec = IsScalarMatrixTransposed ? K : Row;
            const unsigned ScalarIdx = IsScalarMatrixTransposed ? Row : K;
            Value *Scalar = Builder.CreateExtractElement(
                A.getVector(ScalarVec), ScalarIdx);
            Value *Splat =
                Builder.CreateVectorSplat(BlockSize, Scalar, "splat");
            Value *Acc = (StartsAtZero && K == 0) ? nullptr : Sum;
            Sum = createMulAdd(Acc, Splat, RHSBlock, IsFP, Builder,
                               AllowContract, NumComputeOps);
          }
          Result.setVector(
              Row, insertVector(Result.getVector(Row), Col, Sum, Builder));
        }
      }
    }
    Result.addNumComputeOps(NumComputeOps);
  }


  /// Ensure that the memory in \p Load does not alias \p Store by potentially
  /// copying it to a new location.  This new or otherwise the original location
  /// is returned.
  ///
  /// If alias analysis proves the two locations disjoint, the load's pointer
  /// operand is returned unchanged and no code is emitted. Otherwise the block
  /// containing \p MatMul is split so that two runtime range checks run before
  /// the multiply:
  ///   Check0: load.begin < store.end  ? goto Check1 : goto Fusion
  ///   Check1: store.begin < load.end  ? goto Copy   : goto Fusion
  ///   Copy:   memcpy the loaded memory into a fresh alloca
  ///   Fusion: PHI selecting the original pointer or the alloca
  ///
  /// \param Load   Load whose source memory must not overlap the store.
  /// \param Store  Store that the fused multiply will write through.
  /// \param MatMul Multiply call; the new blocks are created in front of it.
  /// \returns The load's pointer operand, or a PHI in the block that now
  ///          contains \p MatMul.
  Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
                               CallInst *MatMul) {
    MemoryLocation StoreLoc = MemoryLocation::get(Store);
    MemoryLocation LoadLoc = MemoryLocation::get(Load);

    // If we can statically determine noalias we're good.
    if (AA->isNoAlias(LoadLoc, StoreLoc))
      return Load->getPointerOperand();

    // Create code to check if the memory locations of the Load and Store
    // overlap and if they do, copy Load's operand to a new buffer.

    // First, create new blocks for the 2nd part of the check and the copy.
    BasicBlock *Check0 = MatMul->getParent();
    // FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a
    // DT. Manually collect dominator tree updates, to avoid unnecessary work,
    // as we adjust Check0 and Check1's branches.
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    // Record the edges leaving Check0 before splitting; they are queued as
    // deletions for the dominator tree update applied at the end.
    for (BasicBlock *Succ : successors(Check0))
      DTUpdates.push_back({DT->Delete, Check0, Succ});

    // Each split happens at MatMul, so MatMul ends up in the last block created
    // (Fusion) and the chain is Check0 -> Check1 -> Copy -> Fusion. No
    // DomTreeUpdater is passed; the dominator tree is fixed up manually below.
    BasicBlock *Check1 =
        SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
                   nullptr, "alias_cont");
    BasicBlock *Copy =
        SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
                   nullptr, "copy");
    BasicBlock *Fusion =
        SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
                   nullptr, "no_alias");

    // Check if the loaded memory location begins before the end of the store
    // location. If the condition holds, they might overlap, otherwise they are
    // guaranteed to not overlap.
    IRBuilder<> Builder(MatMul);
    // Replace the branch left by SplitBlock with the conditional branch built
    // below.
    Check0->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(Check0);
    Type *IntPtrTy = Builder.getIntPtrTy(Load->getDataLayout());
    Value *StoreBegin = Builder.CreatePtrToInt(
        const_cast<Value *>(StoreLoc.Ptr), IntPtrTy, "store.begin");
    // The two trailing `true` arguments mark the add as nuw and nsw.
    Value *StoreEnd = Builder.CreateAdd(
        StoreBegin, ConstantInt::get(IntPtrTy, StoreLoc.Size.getValue()),
        "store.end", true, true);
    Value *LoadBegin = Builder.CreatePtrToInt(const_cast<Value *>(LoadLoc.Ptr),
                                              IntPtrTy, "load.begin");
    Builder.CreateCondBr(Builder.CreateICmpULT(LoadBegin, StoreEnd), Check1,
                         Fusion);

    // Check if the store begins before the end of the load location. If the
    // condition holds, they alias, otherwise they are guaranteed to not
    // overlap.
    Check1->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(Check1, Check1->begin());
    Value *LoadEnd = Builder.CreateAdd(
        LoadBegin, ConstantInt::get(IntPtrTy, LoadLoc.Size.getValue()),
        "load.end", true, true);
    Builder.CreateCondBr(Builder.CreateICmpULT(StoreBegin, LoadEnd), Copy,
                         Fusion);

    // Copy load operand to new alloca.
    Builder.SetInsertPoint(Copy, Copy->begin());
    auto *VT = cast<FixedVectorType>(Load->getType());
    // Use an array type for the alloca, to avoid potentially huge alignment
    // requirements for large vector types.
    auto *ArrayTy = ArrayType::get(VT->getElementType(), VT->getNumElements());
    AllocaInst *Alloca =
        Builder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());

    Builder.CreateMemCpy(Alloca, Alloca->getAlign(), Load->getPointerOperand(),
                         Load->getAlign(), LoadLoc.Size.getValue());
    // Merge the three ways of reaching Fusion: both checks failing keep the
    // original pointer, the copy path supplies the alloca.
    Builder.SetInsertPoint(Fusion, Fusion->begin());
    PHINode *PHI = Builder.CreatePHI(Load->getPointerOperandType(), 3);
    PHI->addIncoming(Load->getPointerOperand(), Check0);
    PHI->addIncoming(Load->getPointerOperand(), Check1);
    PHI->addIncoming(Alloca, Copy);

    // Adjust DT.
    DTUpdates.push_back({DT->Insert, Check0, Check1});
    DTUpdates.push_back({DT->Insert, Check0, Fusion});
    DTUpdates.push_back({DT->Insert, Check1, Copy});
    DTUpdates.push_back({DT->Insert, Check1, Fusion});
    DT->applyUpdates(DTUpdates);
    return PHI;
  }


  /// Cost heuristic deciding whether the multiply \p MatMul should be lowered
  /// with tiling/fusion.
  ///
  /// Returns true unconditionally when ForceFusion is set. Otherwise fusion is
  /// rejected for a single-column result whose rows fit into one vector
  /// register, and accepted only when the estimated number of vector registers
  /// needed to hold both operands exceeds the number of registers the target
  /// reports.
  bool isFusionProfitable(CallInst *MatMul) {
    // The command-line override bypasses the cost model entirely.
    if (ForceFusion)
      return true;

    ShapeInfo LhsShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RhsShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
    const unsigned NumRows = LhsShape.NumRows;
    const unsigned NumCols = RhsShape.NumColumns;
    const unsigned NumInner = LhsShape.NumColumns;

    // Number of elements of the result's element type that fit into one
    // fixed-width vector register (at least one).
    Type *ScalarTy = cast<VectorType>(MatMul->getType())->getElementType();
    const auto RegisterBits =
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
            .getFixedValue();
    const auto ElementBits = ScalarTy->getPrimitiveSizeInBits().getFixedValue();
    const unsigned VF = std::max<unsigned>(RegisterBits / ElementBits, 1U);

    // Tiling only pays off if there is reuse along the row or the column axis.
    // A single result column whose rows fit in one vector offers none.
    // TODO: Also consider cost of copying if operands alias.
    const bool FitsSingleVector = NumRows <= VF;
    if (FitsSingleVector && NumCols == 1)
      return false;

    // Estimate how many vector registers each operand occupies and require the
    // total to exceed what the target provides. This is an oversimplification:
    // fusing also introduces extra loads, which may outnumber the reloads that
    // would otherwise be needed.
    const unsigned LhsRegs = (NumRows + VF - 1) / VF * NumInner;
    const unsigned RhsRegs = (NumInner + VF - 1) / VF * NumCols;
    const unsigned AvailableRegs =
        TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true));
    return LhsRegs + RhsRegs > AvailableRegs;
  }


  /// Build a matrix consisting of \p C vectors, each an all-zero constant
  /// vector of \p R elements of type \p EltType.
  MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
    MatrixTy Zero;
    auto *ColumnTy = FixedVectorType::get(EltType, R);
    unsigned Remaining = C;
    while (Remaining-- != 0)
      Zero.addVector(ConstantAggregateZero::get(ColumnTy));
    return Zero;
  }


  /// Lower \p MatMul to a loop nest that computes the product one
  /// TileSize x TileSize tile at a time and stores each finished tile through
  /// \p Store's pointer operand.
  ///
  /// \param MatMul Multiply call; the loop nest is created in front of it.
  /// \param LPtr   Pointer to the left operand, of shape \p LShape.
  /// \param LShape Shape of the left operand.
  /// \param RPtr   Pointer to the right operand, of shape \p RShape.
  /// \param RShape Shape of the right operand.
  /// \param Store  Store of the multiply result; supplies destination pointer,
  ///               alignment and volatility.
  void createTiledLoops(CallInst *MatMul, Value *LPtr, ShapeInfo LShape,
                        Value *RPtr, ShapeInfo RShape, StoreInst *Store) {
    auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();

    // Create the main tiling loop nest.
    TileInfo TI(LShape.NumRows, RShape.NumColumns, LShape.NumColumns, TileSize);
    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
    Instruction *InsertI = cast<Instruction>(MatMul);
    BasicBlock *Start = InsertI->getParent();
    // Split at the multiply so the loop nest can be placed between Start and
    // the "continue" block that now holds MatMul.
    BasicBlock *End =
        SplitBlock(InsertI->getParent(), InsertI, DT, LI, nullptr, "continue");
    IRBuilder<> Builder(MatMul);
    BasicBlock *InnerBody = TI.CreateTiledLoops(Start, End, Builder, DTU, *LI);

    Type *TileVecTy =
        FixedVectorType::get(MatMul->getType()->getScalarType(), TileSize);
    MatrixTy TileResult;
    // Insert in the inner loop header.
    Builder.SetInsertPoint(TI.KLoop.Header->getTerminator());
    // Create PHI nodes for the result columns to accumulate across iterations.
    SmallVector<PHINode *, 4> ColumnPhis;
    for (unsigned I = 0; I < TileSize; I++) {
      auto *Phi = Builder.CreatePHI(TileVecTy, 2, "result.vec." + Twine(I));
      // On entry to the K loop the accumulator starts at zero; the second
      // incoming value (from the K loop latch) is added once it exists below.
      Phi->addIncoming(ConstantAggregateZero::get(TileVecTy),
                       TI.RowLoop.Header->getSingleSuccessor());
      TileResult.addVector(Phi);
      ColumnPhis.push_back(Phi);
    }

    // Insert in the inner loop body, which computes
    //   Res += Load(CurrentRow, K) * Load(K, CurrentColumn)
    Builder.SetInsertPoint(InnerBody->getTerminator());
    // Load tiles of the operands.
    MatrixTy A =
        loadMatrix(LPtr, {}, false, LShape, TI.RowLoop.Index, TI.KLoop.Index,
                   {TileSize, TileSize}, EltType, Builder);
    MatrixTy B =
        loadMatrix(RPtr, {}, false, RShape, TI.KLoop.Index, TI.ColumnLoop.Index,
                   {TileSize, TileSize}, EltType, Builder);
    // NOTE(review): the `true, false` arguments are presumably IsTiled and
    // IsScalarMatrixTransposed, i.e. accumulate into the existing TileResult
    // vectors (the PHIs) without treating an operand as transposed.
    emitMatrixMultiply(TileResult, A, B, Builder, true, false,
                       getFastMathFlags(MatMul));
    // Store result after the inner loop is done.
    Builder.SetInsertPoint(TI.RowLoop.Latch->getTerminator());
    storeMatrix(TileResult, Store->getPointerOperand(), Store->getAlign(),
                Store->isVolatile(), {LShape.NumRows, RShape.NumColumns},
                TI.RowLoop.Index, TI.ColumnLoop.Index, EltType, Builder);

    // Close the accumulation cycle: feed the updated tile vectors back into the
    // PHIs along the K loop's back edge.
    for (unsigned I = 0; I < TileResult.getNumVectors(); I++)
      ColumnPhis[I]->addIncoming(TileResult.getVector(I), TI.KLoop.Latch);

    // Force unrolling of a few iterations of the inner loop, to make sure there
    // is enough work per iteration.
    // FIXME: The unroller should make this decision directly instead, but
    // currently the cost-model is not up to the task.
    // The requested count is the number of K tiles, capped at 10.
    unsigned InnerLoopUnrollCount = std::min(10u, LShape.NumColumns / TileSize);
    addStringMetadataToLoop(LI->getLoopFor(TI.KLoop.Header),
                            "llvm.loop.unroll.count", InnerLoopUnrollCount);
  }


  /// Lower a {load, load} -> multiply -> store chain by computing the product
  /// in tiles that are loaded from and stored to memory directly.
  ///
  /// Does nothing if isFusionProfitable() rejects \p MatMul. Otherwise both
  /// operand pointers are first made non-aliasing with respect to \p Store
  /// (see getNonAliasingPointer). The multiply is then emitted either as a
  /// loop nest (when TileUseLoops is set and both result dimensions are
  /// multiples of TileSize) or fully unrolled over all tiles. Finally the
  /// store, the multiply and any operand load left without uses are erased.
  ///
  /// \param MatMul     The llvm.matrix.multiply call to lower.
  /// \param LoadOp0    Load producing the left operand.
  /// \param LoadOp1    Load producing the right operand.
  /// \param Store      The store consuming the multiply result.
  /// \param FusedInsts Receives every instruction eliminated by the fusion.
  void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
                      StoreInst *Store,
                      SmallPtrSetImpl<Instruction *> &FusedInsts) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Tiling only supported for column-major matrixes at the moment!");
    if (!isFusionProfitable(MatMul))
      return;

    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));

    // The product is R x C; M is the shared inner dimension.
    const unsigned R = LShape.NumRows;
    const unsigned C = RShape.NumColumns;
    const unsigned M = LShape.NumColumns;
    auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();

    Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
    Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
    Value *CPtr = Store->getPointerOperand();

    if (TileUseLoops && (R % TileSize == 0 && C % TileSize == 0))
      createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store);
    else {
      IRBuilder<> Builder(Store);
      for (unsigned J = 0; J < C; J += TileSize)
        for (unsigned I = 0; I < R; I += TileSize) {
          // Tiles on the lower/right border may be smaller than TileSize.
          const unsigned TileR = std::min(R - I, unsigned(TileSize));
          const unsigned TileC = std::min(C - J, unsigned(TileSize));
          MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);

          // Accumulate the contributions of all tiles along the inner axis.
          for (unsigned K = 0; K < M; K += TileSize) {
            const unsigned TileM = std::min(M - K, unsigned(TileSize));
            MatrixTy A =
                loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(),
                           LShape, Builder.getInt64(I), Builder.getInt64(K),
                           {TileR, TileM}, EltType, Builder);
            MatrixTy B =
                loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
                           RShape, Builder.getInt64(K), Builder.getInt64(J),
                           {TileM, TileC}, EltType, Builder);
            emitMatrixMultiply(Res, A, B, Builder, true, false,
                               getFastMathFlags(MatMul));
          }
          // The destination holds the R x C product, so describe it with the
          // result shape (as createTiledLoops does) rather than with the shape
          // of the left operand.
          storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, C},
                      Builder.getInt64(I), Builder.getInt64(J), EltType,
                      Builder);
        }
    }

    // Mark eliminated instructions as fused and remove them.
    FusedInsts.insert(Store);
    FusedInsts.insert(MatMul);
    eraseFromParentAndRemoveFromShapeMap(Store);
    eraseFromParentAndRemoveFromShapeMap(MatMul);
    if (LoadOp0->hasNUses(0)) {
      FusedInsts.insert(LoadOp0);
      eraseFromParentAndRemoveFromShapeMap(LoadOp0);
    }
    // Both operands may be the same load; do not erase it twice.
    if (LoadOp1 != LoadOp0 && LoadOp1->hasNUses(0)) {
      FusedInsts.insert(LoadOp1);
      eraseFromParentAndRemoveFromShapeMap(LoadOp1);
    }
  }


  /// Try to lower matrix multiply chains by fusing operations.
  ///
  /// Call finalizeLowering on lowered instructions.  Instructions that are
  /// completely eliminated by fusion are added to \p FusedInsts.
  ///
  /// Two patterns are handled:
  ///  1. One multiply operand is a matrix transpose (the second operand for
  ///     column-major layout, the first one otherwise). The transpose is
  ///     folded into the multiply.
  ///  2. Column-major only: both operands are loads and the single user of the
  ///     multiply is a store. The chain is handed to emitSIMDTiling after
  ///     making sure the store address is available at the multiply and that
  ///     no lifetime.end marker invalidates the loaded memory.
  ///
  /// Does nothing if fusion is disabled (FuseMatrix) or no dominator tree is
  /// available.
  ///
  /// \param MatMul       The llvm.matrix.multiply call to inspect.
  /// \param FusedInsts   Receives the instructions eliminated by fusion.
  /// \param LifetimeEnds Known lifetime.end calls; entries that conflict with
  ///                     the fusion and cannot be sunk are removed from it.
  void
  LowerMatrixMultiplyFused(CallInst *MatMul,
                           SmallPtrSetImpl<Instruction *> &FusedInsts,
                           SmallVector<IntrinsicInst *, 16> &LifetimeEnds) {
    if (!FuseMatrix || !DT)
      return;

    assert(AA && LI && "Analyses should be available");

    Value *A = MatMul->getArgOperand(0);
    Value *B = MatMul->getArgOperand(1);

    // We can fold the transpose into the operand that is used to fetch scalars.
    Value *T;
    if (MatrixLayout == MatrixLayoutTy::ColumnMajor
            ? match(B, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(T)))
            : match(A, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(T)))) {
      IRBuilder<> Builder(MatMul);
      auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
      ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
      ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
      const unsigned R = LShape.NumRows;
      const unsigned M = LShape.NumColumns;
      const unsigned C = RShape.NumColumns;

      MatrixTy MA;
      MatrixTy MB;

      // T is the input of the matched transpose; it replaces the transposed
      // operand, and Transpose remembers the transpose instruction itself.
      Value *Transpose;
      if (MatrixLayout == MatrixLayoutTy::ColumnMajor) {
        MA = getMatrix(A, ShapeInfo(R, M), Builder);
        MB = getMatrix(T, ShapeInfo(C, M), Builder);
        Transpose = B;
      } else {
        MA = getMatrix(T, ShapeInfo(R, M), Builder);
        MB = getMatrix(B, ShapeInfo(C, M), Builder);
        Transpose = A;
      }

      // Initialize the output
      MatrixTy Result(R, C, EltType);

      // NOTE(review): `false, true` are presumably IsTiled and
      // IsScalarMatrixTransposed, i.e. a non-tiled multiply that reads the
      // scalar operand with swapped indices.
      emitMatrixMultiply(Result, MA, MB, Builder, false, true,
                         getFastMathFlags(MatMul));

      FusedInsts.insert(MatMul);
      // The transpose only becomes dead if the multiply was its sole user.
      if (Transpose->hasOneUse()) {
        FusedInsts.insert(cast<Instruction>(Transpose));
        ToRemove.push_back(cast<Instruction>(Transpose));
        // TODO: add a fake entry for the folded instruction so that this is
        // included in the expression in the remark.
        Inst2ColumnMatrix[Transpose] = MatrixTy(M, C, EltType);
      }
      finalizeLowering(MatMul, Result, Builder);
      return;
    }

    if (!MatMul->hasOneUse() || MatrixLayout != MatrixLayoutTy::ColumnMajor)
      return;

    // Lower {ld, ld} -> matmul -> st chains.  No need to call finalizeLowering
    // since the single store user will be lowered as part of this.
    auto *LoadOp0 = dyn_cast<LoadInst>(A);
    auto *LoadOp1 = dyn_cast<LoadInst>(B);
    auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin());
    if (LoadOp0 && LoadOp1 && Store) {
      // The store address must dominate the MatMul instruction, otherwise
      // we create invalid IR.
      // Walk the address computation transitively and collect every
      // instruction that does not yet dominate MatMul. Give up on PHIs and on
      // instructions that have side effects or read memory.
      SetVector<Value *> WorkList;
      WorkList.insert(Store->getOperand(1));
      SmallVector<Instruction *> ToHoist;
      for (unsigned I = 0; I != WorkList.size(); ++I) {
        Value *Current = WorkList[I];
        auto *CurrI = dyn_cast<Instruction>(Current);
        if (!CurrI)
          continue;
        if (isa<PHINode>(CurrI))
          return;
        if (DT->dominates(CurrI, MatMul))
          continue;
        if (CurrI->mayHaveSideEffects() || CurrI->mayReadFromMemory())
          return;
        ToHoist.push_back(CurrI);
        WorkList.insert(CurrI->op_begin(), CurrI->op_end());
      }

      // Move the collected instructions in front of MatMul, ordered by
      // dominance so that definitions are placed before their uses.
      sort(ToHoist, [this](Instruction *A, Instruction *B) {
        return DT->dominates(A, B);
      });
      for (Instruction *I : ToHoist)
        I->moveBefore(MatMul);

      // Deal with lifetime.end calls that might be between Load0/Load1 and the
      // store. To avoid introducing loads to dead objects (i.e. after the
      // lifetime has been terminated by @llvm.lifetime.end), either sink them
      // after the store if in the same block, or remove the lifetime.end marker
      // otherwise. This might pessimize further optimizations, by extending the
      // lifetime of the object until the function returns, but should be
      // conservatively correct.
      MemoryLocation Load0Loc = MemoryLocation::get(LoadOp0);
      MemoryLocation Load1Loc = MemoryLocation::get(LoadOp1);
      BasicBlock *StoreParent = Store->getParent();
      bool FusableOpsInSameBlock = LoadOp0->getParent() == StoreParent &&
                                   LoadOp1->getParent() == StoreParent;
      for (unsigned Idx = 0; Idx != LifetimeEnds.size();) {
        IntrinsicInst *End = LifetimeEnds[Idx];
        // Advance to the next entry on every `continue`. The increment is
        // cancelled via release() below when the current entry is removed by
        // swapping in the last element, so that element is examined next.
        auto Inc = make_scope_exit([&Idx]() { Idx++; });
        // If the lifetime.end is guaranteed to be before the loads or after the
        // store, it won't interfere with fusion.
        if (DT->dominates(End, LoadOp0) && DT->dominates(End, LoadOp1))
          continue;
        if (DT->dominates(Store, End))
          continue;
        // If all fusable ops are in the same block and the lifetime.end is in a
        // different block, it won't interfere with fusion.
        if (FusableOpsInSameBlock && End->getParent() != StoreParent)
          continue;

        // If the loads don't alias the lifetime.end, it won't interfere with
        // fusion.
        MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 1, nullptr);
        if (!EndLoc.Ptr)
          continue;
        if (AA->isNoAlias(Load0Loc, EndLoc) && AA->isNoAlias(Load1Loc, EndLoc))
          continue;

        // If both lifetime.end and the store are in the same block, extend the
        // lifetime until after the store, so the new lifetime covers the loads
        // we introduce later.
        if (End->getParent() == StoreParent) {
          End->moveAfter(Store);
          continue;
        }

        // Otherwise remove the conflicting lifetime.end marker.
        ToRemove.push_back(End);
        std::swap(LifetimeEnds[Idx], LifetimeEnds.back());
        LifetimeEnds.pop_back();
        Inc.release();
      }

      emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
      return;
    }
  }


  /// Lowers llvm.matrix.multiply.
  ///
  /// Both operands are fetched as matrices using the shapes encoded in the
  /// call's trailing arguments, the product is emitted with the call's
  /// fast-math flags, and the result is registered through finalizeLowering.
  void LowerMultiply(CallInst *MatMul) {
    IRBuilder<> Builder(MatMul);
    Type *ElementTy = cast<VectorType>(MatMul->getType())->getElementType();

    // Arguments 2..4 carry the dimensions: (2, 3) describe the left operand,
    // (3, 4) the right operand.
    ShapeInfo LhsShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RhsShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));

    Value *LhsVal = MatMul->getArgOperand(0);
    const MatrixTy &Lhs = getMatrix(LhsVal, LhsShape, Builder);
    Value *RhsVal = MatMul->getArgOperand(1);
    const MatrixTy &Rhs = getMatrix(RhsVal, RhsShape, Builder);
    assert(Lhs.getElementType() == Rhs.getElementType() &&
           "Matrix multiply argument element types do not match.");
    assert(LhsShape.NumColumns == RhsShape.NumRows);

    // The product has the left operand's rows and the right operand's columns.
    MatrixTy Product(LhsShape.NumRows, RhsShape.NumColumns, ElementTy);
    assert(Lhs.getElementType() == Product.getElementType() &&
           "Matrix multiply result element type does not match arguments.");

    emitMatrixMultiply(Product, Lhs, Rhs, Builder, false, false,
                       getFastMathFlags(MatMul));
    finalizeLowering(MatMul, Product, Builder);
  }


  /// Lowers llvm.matrix.transpose.
  ///
  /// Every result vector is assembled element by element: result vector I
  /// takes element I of each input vector, in input-vector order.
  void LowerTranspose(CallInst *Inst) {
    MatrixTy Result;
    IRBuilder<> Builder(Inst);
    Value *Input = Inst->getArgOperand(0);
    VectorType *InputTy = cast<VectorType>(Input->getType());
    ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
    MatrixTy InputMatrix = getMatrix(Input, ArgShape, Builder);

    // Transposing swaps the number of vectors with the vector length.
    unsigned NewNumVecs = ArgShape.NumColumns;
    unsigned NewNumElts = ArgShape.NumRows;
    if (InputMatrix.isColumnMajor())
      std::swap(NewNumVecs, NewNumElts);

    for (unsigned I = 0; I != NewNumVecs; ++I) {
      // Start from poison and fill in one element per input vector.
      Value *NewVec = PoisonValue::get(
          FixedVectorType::get(InputTy->getElementType(), NewNumElts));
      for (unsigned J = 0, E = InputMatrix.getNumVectors(); J != E; ++J) {
        Value *Elt = Builder.CreateExtractElement(InputMatrix.getVector(J), I);
        // Row and column indices are transposed.
        NewVec = Builder.CreateInsertElement(NewVec, Elt, J);
      }
      Result.addVector(NewVec);
    }

    // TODO: Improve estimate of operations needed for transposes. Currently we
    // just count the insertelement/extractelement instructions, but do not
    // account for later simplifications/combines.
    const unsigned NumInsertExtract = 2 * ArgShape.NumRows * ArgShape.NumColumns;
    finalizeLowering(
        Inst,
        Result.addNumComputeOps(NumInsertExtract).addNumExposedTransposes(1),
        Builder);
  }


  /// Lower load instructions, if shape information is available.
  ///
  /// \returns true if \p Inst has an entry in ShapeMap and was handed to
  ///          LowerLoad, false otherwise.
  bool VisitLoad(LoadInst *Inst, Value *Ptr, IRBuilder<> &Builder) {
    auto ShapeIt = ShapeMap.find(Inst);
    const bool HasShape = ShapeIt != ShapeMap.end();
    if (HasShape)
      LowerLoad(Inst, Ptr, Inst->getAlign(),
                Builder.getInt64(ShapeIt->second.getStride()),
                Inst->isVolatile(), ShapeIt->second);
    return HasShape;
  }


  /// Lower store instructions, if shape information is available for the
  /// stored value.
  ///
  /// \returns true if \p StoredVal has an entry in ShapeMap and the store was
  ///          handed to LowerStore, false otherwise.
  bool VisitStore(StoreInst *Inst, Value *StoredVal, Value *Ptr,
                  IRBuilder<> &Builder) {
    auto ShapeIt = ShapeMap.find(StoredVal);
    const bool HasShape = ShapeIt != ShapeMap.end();
    if (HasShape)
      LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
                 Builder.getInt64(ShapeIt->second.getStride()),
                 Inst->isVolatile(), ShapeIt->second);
    return HasShape;
  }


  /// Lower binary operators, if shape information is available.
  ///
  /// The operator is applied vector by vector to the two operand matrices,
  /// using the fast-math flags of \p Inst. Supported opcodes are Add, Mul,
  /// Sub, FAdd, FMul and FSub.
  ///
  /// \returns false if \p Inst has no entry in ShapeMap, true once lowered.
  bool VisitBinaryOperator(BinaryOperator *Inst) {
    auto ShapeIt = ShapeMap.find(Inst);
    if (ShapeIt == ShapeMap.end())
      return false;
    ShapeInfo &Shape = ShapeIt->second;

    IRBuilder<> Builder(Inst);

    MatrixTy Result;
    MatrixTy A = getMatrix(Inst->getOperand(0), Shape, Builder);
    MatrixTy B = getMatrix(Inst->getOperand(1), Shape, Builder);
    assert(A.isColumnMajor() == B.isColumnMajor() &&
           Result.isColumnMajor() == A.isColumnMajor() &&
           "operands must agree on matrix layout");

    Builder.setFastMathFlags(getFastMathFlags(Inst));

    // Emits the vector form of Inst's operation for one pair of vectors.
    const auto Opcode = Inst->getOpcode();
    auto EmitVectorOp = [&Builder, Opcode](Value *LHS, Value *RHS) -> Value * {
      switch (Opcode) {
      case Instruction::Add:
        return Builder.CreateAdd(LHS, RHS);
      case Instruction::Sub:
        return Builder.CreateSub(LHS, RHS);
      case Instruction::Mul:
        return Builder.CreateMul(LHS, RHS);
      case Instruction::FAdd:
        return Builder.CreateFAdd(LHS, RHS);
      case Instruction::FSub:
        return Builder.CreateFSub(LHS, RHS);
      case Instruction::FMul:
        return Builder.CreateFMul(LHS, RHS);
      default:
        llvm_unreachable("Unsupported binary operator for matrix");
      }
    };

    for (unsigned Idx = 0; Idx != Shape.getNumVectors(); ++Idx) {
      Value *LhsVec = A.getVector(Idx);
      Value *RhsVec = B.getVector(Idx);
      Result.addVector(EmitVectorOp(LhsVec, RhsVec));
    }

    // Account for one vector operation per result vector.
    finalizeLowering(Inst,
                     Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                             Result.getNumVectors()),
                     Builder);
    return true;
  }


  /// Lower unary operators, if shape information is available.
  ///
  /// The operator is applied vector by vector to the operand matrix, using the
  /// fast-math flags of \p Inst. Only FNeg is supported.
  ///
  /// \returns false if \p Inst has no entry in ShapeMap, true once lowered.
  bool VisitUnaryOperator(UnaryOperator *Inst) {
    auto ShapeIt = ShapeMap.find(Inst);
    if (ShapeIt == ShapeMap.end())
      return false;
    ShapeInfo &Shape = ShapeIt->second;

    IRBuilder<> Builder(Inst);

    MatrixTy Result;
    MatrixTy M = getMatrix(Inst->getOperand(0), Shape, Builder);

    Builder.setFastMathFlags(getFastMathFlags(Inst));

    // Emits the vector form of Inst's operation for one vector.
    const auto Opcode = Inst->getOpcode();
    auto EmitVectorOp = [&Builder, Opcode](Value *Op) -> Value * {
      switch (Opcode) {
      case Instruction::FNeg:
        return Builder.CreateFNeg(Op);
      default:
        llvm_unreachable("Unsupported unary operator for matrix");
      }
    };

    for (unsigned Idx = 0; Idx != Shape.getNumVectors(); ++Idx) {
      Value *Vec = M.getVector(Idx);
      Result.addVector(EmitVectorOp(Vec));
    }

    // Account for one vector operation per result vector.
    finalizeLowering(Inst,
                     Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                             Result.getNumVectors()),
                     Builder);
    return true;
  }


  /// Helper to linearize a matrix expression tree into a string. Currently

  /// matrix expressions are linearized by starting at an expression leaf and

  /// linearizing bottom up.

  struct ExprLinearizer {

    unsigned LengthToBreak = 100;

    std::string Str;

    raw_string_ostream Stream;

    unsigned LineLength = 0;

    const DataLayout &DL;


    /// Mapping from instructions to matrixes. It is used to identify

    /// matrix instructions.

    const MapVector<Value *, MatrixTy> &Inst2Matrix;


    /// Mapping from values to the leaves of all expressions that the value is

    /// part of.

    const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;


    /// Set of matrix expressions in the scope of a given DISubprogram.

    const SmallSetVector<Value *, 32> &ExprsInSubprogram;


    /// Leaf node of the expression to linearize.

    Value *Leaf;


    /// Used to keep track of sub-expressions that get reused while linearizing

    /// the expression. Re-used sub-expressions are marked as (reused).

    SmallPtrSet<Value *, 8> ReusedExprs;


    /// Set up a linearizer for the expression rooted at \p Leaf.
    ///
    /// The maps and sets are stored by reference and are not copied. Stream is
    /// bound to Str, so everything written through it accumulates in Str.
    ///
    /// \param DL                Data layout, stored by reference.
    /// \param Inst2Matrix       Mapping from instructions to their matrixes.
    /// \param Shared            Mapping from values to the leaves of all
    ///                          expressions they are part of.
    /// \param ExprsInSubprogram Matrix expressions in the current subprogram.
    /// \param Leaf              Leaf node of the expression to linearize.
    ExprLinearizer(const DataLayout &DL,
                   const MapVector<Value *, MatrixTy> &Inst2Matrix,
                   const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
                   const SmallSetVector<Value *, 32> &ExprsInSubprogram,
                   Value *Leaf)
        : Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),
          ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}


    /// Emit \p N spaces and account for them in the current line length.
    void indent(unsigned N) {
      for (unsigned Remaining = N; Remaining != 0; --Remaining)
        Stream << " ";
      LineLength += N;
    }


    /// Start a new output line and reset the tracked line length.
    void lineBreak() {
      LineLength = 0;
      Stream << "\n";
    }


    /// Break the line once it has reached LengthToBreak characters, then
    /// indent by \p Indent if the output is at the start of a line.
    void maybeIndent(unsigned Indent) {
      const bool LineIsFull = LineLength >= LengthToBreak;
      if (LineIsFull)
        lineBreak();

      // Only a fresh line gets indented.
      if (LineLength != 0)
        return;
      indent(Indent);
    }


    /// Append \p S to the output and add its size to the current line length.
    void write(StringRef S) {
      Stream << S;
      LineLength += S.size();
    }


    /// Follow pointer operands starting at \p V for as long as
    /// getPointerOperand yields one. If the value reached that way is a
    /// pointer, return its underlying object; otherwise return the value
    /// itself.
    Value *getUnderlyingObjectThroughLoads(Value *V) {
      Value *Current = V;
      while (Value *Ptr = getPointerOperand(Current))
        Current = Ptr;

      if (!Current->getType()->isPointerTy())
        return Current;
      return getUnderlyingObject(Current);
    }


    /// Returns true if \p V is a matrix value in the given subprogram.
    bool isMatrix(Value *V) const {
      return ExprsInSubprogram.count(V) != 0;
    }


    /// If \p V is a matrix value, print its shape as NumRows x NumColumns to
    /// \p SS. Values without an entry in Inst2Matrix are printed as "unknown".
    void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {
      auto Entry = Inst2Matrix.find(V);
      if (Entry == Inst2Matrix.end()) {
        SS << "unknown";
        return;
      }

      const MatrixTy &Matrix = Entry->second;
      SS << Matrix.getNumRows() << "x" << Matrix.getNumColumns();
    }


    /// Write the called function name. Handles calls to llvm.matrix.*
    /// specially: we write the name, followed by the dimensions of the input
    /// matrixes, followed by the scalar type name.
    ///
    /// Calls without a called function are written as "<no called fn>"; any
    /// other non-matrix callee is written by its plain name.
    void writeFnName(CallInst *CI) {
      Function *Callee = CI->getCalledFunction();
      if (!Callee) {
        write("<no called fn>");
        return;
      }

      StringRef Name = Callee->getName();
      if (!Name.starts_with("llvm.matrix")) {
        write(Name);
        return;
      }

      // Strip the common "llvm.matrix." prefix from the intrinsic's base name.
      auto *II = cast<IntrinsicInst>(CI);
      const size_t PrefixLen = StringRef("llvm.matrix.").size();
      write(Intrinsic::getBaseName(II->getIntrinsicID()).drop_front(PrefixLen));
      write(".");

      // Collect the shape/type suffix separately so it is written, and counted
      // towards the line length, in one piece.
      std::string Suffix;
      raw_string_ostream SS(Suffix);
      switch (II->getIntrinsicID()) {
      case Intrinsic::matrix_multiply: {
        prettyPrintMatrixType(II->getOperand(0), SS);
        SS << ".";
        prettyPrintMatrixType(II->getOperand(1), SS);
        SS << "." << *II->getType()->getScalarType();
        break;
      }
      case Intrinsic::matrix_transpose: {
        prettyPrintMatrixType(II->getOperand(0), SS);
        SS << "." << *II->getType()->getScalarType();
        break;
      }
      case Intrinsic::matrix_column_major_load: {
        // The shape of a load is that of the loaded value itself.
        prettyPrintMatrixType(II, SS);
        SS << "." << *II->getType()->getScalarType();
        break;
      }
      case Intrinsic::matrix_column_major_store: {
        // A store is described by the matrix being stored (operand 0).
        prettyPrintMatrixType(II->getOperand(0), SS);
        SS << "." << *II->getOperand(0)->getType()->getScalarType();
        break;
      }
      default:
        llvm_unreachable("Unhandled case");
      }
      write(Suffix);
    }


    /// Number of trailing arguments of \p CI to leave out when collecting its
    /// operands: 3 for matrix multiply, column-major load and column-major
    /// store, 2 for transpose, and 0 for every other call.
    unsigned getNumShapeArgs(CallInst *CI) const {
      auto *II = dyn_cast<IntrinsicInst>(CI);
      if (!II)
        return 0;

      switch (II->getIntrinsicID()) {
      case Intrinsic::matrix_transpose:
        return 2;
      case Intrinsic::matrix_multiply:
      case Intrinsic::matrix_column_major_load:
      case Intrinsic::matrix_column_major_store:
        return 3;
      default:
        return 0;
      }
    }


    /// Special printing for values: for pointers, we print whether they refer
    /// to a stack address (an alloca) or some other address, followed by the
    /// value's name if it has one. Other values are printed as the integer
    /// constant, "constant", "matrix" or "scalar".
    void write(Value *V) {
      V = getUnderlyingObjectThroughLoads(V);

      if (V->getType()->isPointerTy()) {
        write(isa<AllocaInst>(V) ? "stack addr" : "addr");
        StringRef Name = V->getName();
        if (!Name.empty()) {
          write(" %");
          write(Name);
        }
        return;
      }

      // Render into a scratch buffer first so the text can be trimmed before
      // it is written and counted towards the line length.
      std::string Scratch;
      raw_string_ostream ScratchStream(Scratch);
      if (auto *CI = dyn_cast<ConstantInt>(V))
        ScratchStream << CI->getValue();
      else if (isa<Constant>(V))
        ScratchStream << "constant";
      else
        ScratchStream << (isMatrix(V) ? "matrix" : "scalar");

      write(StringRef(Scratch).trim());
    }


    /// Linearize expression \p Expr starting at an indentation of \p Indent.

    /// Expressions that are re-used multiple times are prefixed with (reused)

    /// at the re-used root instruction.

    void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused,
                       bool ParentShared) {
      auto *I = cast<Instruction>(Expr);
      maybeIndent(Indent);
      SmallVector<Value *, 8> Ops;

      // Is Expr shared with other expression leaves?
      bool ExprShared = false;

      // Deal with shared subtrees. Mark them as shared, if required. Nothing is
      // looked up when an enclosing expression was already reported as shared.
      if (!ParentShared) {
        auto SI = Shared.find(Expr);
        assert(SI != Shared.end() && SI->second.count(Leaf));

        for (Value *S : SI->second) {
          if (S == Leaf)
            continue;
          // NOTE(review): the "(" written here has no matching ")" emitted by
          // this function -- confirm whether that is intended.
          DebugLoc DL = cast<Instruction>(S)->getDebugLoc();
          write("shared with remark at line " + std::to_string(DL.getLine()) +
                " column " + std::to_string(DL.getCol()) + " (");
        }
        ExprShared = SI->second.size() > 1;
      }

      // Only the outermost occurrence of a re-used expression gets the prefix;
      // ParentReused suppresses it for everything nested below.
      bool Reused = !ReusedExprs.insert(Expr).second;
      if (Reused && !ParentReused)
        write("(reused) ");

      if (auto *CI = dyn_cast<CallInst>(I)) {
        writeFnName(CI);

        // Drop the trailing arguments counted by getNumShapeArgs.
        Ops.append(CI->arg_begin(), CI->arg_end() - getNumShapeArgs(CI));
      } else if (isa<BitCastInst>(Expr)) {
        // Special case bitcasts, which are used to materialize matrixes from
        // non-matrix ops.
        write("matrix");
        return;
      } else {
        Ops.append(I->value_op_begin(), I->value_op_end());
        write(std::string(I->getOpcodeName()));
      }

      write(std::string("("));

      unsigned NumOpsToBreak = 1;
      if (match(Expr, m_Intrinsic<Intrinsic::matrix_column_major_load>()))
        NumOpsToBreak = 2;

      // Iterate by position: deciding "is this the last operand?" by comparing
      // values would drop the separator whenever an earlier operand is the
      // same Value as the last one (e.g. both operands of `fadd %a, %a`).
      for (unsigned Idx = 0, NumOps = Ops.size(); Idx != NumOps; ++Idx) {
        Value *Op = Ops[Idx];
        if (NumOps > NumOpsToBreak)
          lineBreak();

        maybeIndent(Indent + 1);
        if (isMatrix(Op))
          linearizeExpr(Op, Indent + 1, Reused, ExprShared);
        else
          write(Op);
        if (Idx + 1 != NumOps)
          write(", ");
      }

      write(")");
    }


    /// Give read-only access to the Str member (presumably the buffer the
    /// linearized text is written into; its declaration is outside this view).
    const std::string &getResult() { return Str; }

  };


  /// Generate remarks for matrix operations in a function. To generate remarks
  /// for matrix expressions, the following approach is used:
  /// 1. Use the inlined-at debug information to group matrix operations to the
  ///    DISubprograms they are contained in.
  /// 2. Collect leaves of matrix expressions (done in
  ///    RemarkGenerator::getExpressionLeaves) for each subprogram - expression
  ///    mapping.  Leaves are lowered matrix instructions without other matrix
  ///    users (like stores) in the current subprogram.
  /// 3. For each leaf, create a remark containing a linearized version of the
  ///    matrix expression. The expression is linearized by a recursive
  ///    bottom-up traversal of the matrix operands, starting at a leaf. Note
  ///    that multiple leaves can share sub-expressions. Shared subexpressions
  ///    are explicitly marked as shared().

  struct RemarkGenerator {

    /// Values with an associated MatrixTy. emitRemarks() casts every key to
    /// Instruction and sumOpInfos() reads the op counts of the mapped MatrixTy.
    const MapVector<Value *, MatrixTy> &Inst2Matrix;

    /// Queried to decide whether remarks are wanted, and used to emit them.
    OptimizationRemarkEmitter &ORE;

    /// Function being processed; whether it has a DISubprogram decides how
    /// emitRemarks() groups the matrix operations.
    Function &Func;

    /// Data layout taken from Func; passed on to linearize().
    const DataLayout &DL;


    /// Bind the generator to the given map, remark emitter and function. Only
    /// references are stored, so all three must outlive this object.
    RemarkGenerator(const MapVector<Value *, MatrixTy> &Inst2Matrix,

                    OptimizationRemarkEmitter &ORE, Function &Func)

        : Inst2Matrix(Inst2Matrix), ORE(ORE), Func(Func),

          DL(Func.getDataLayout()) {}


    /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are

    /// instructions in Inst2Matrix returning void or without any users in

    /// \p ExprsInSubprogram. Currently that should only include stores.

    SmallVector<Value *, 4>
    getExpressionLeaves(const SmallSetVector<Value *, 32> &ExprsInSubprogram) {
      // A user is relevant only if it is itself one of the tracked expressions.
      auto IsTrackedUser = [&ExprsInSubprogram](User *U) {
        return ExprsInSubprogram.count(U);
      };

      SmallVector<Value *, 4> Result;
      for (Value *Candidate : ExprsInSubprogram) {
        // Void-typed expressions are always leaves; otherwise the expression
        // is a leaf when none of its users is tracked.
        bool IsLeaf = Candidate->getType()->isVoidTy() ||
                      !any_of(Candidate->users(), IsTrackedUser);
        if (IsLeaf)
          Result.push_back(Candidate);
      }
      return Result;
    }


    /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf

    /// to all visited expressions in \p Shared. Limit the matrix operations to

    /// the ones in \p ExprsInSubprogram.

    void collectSharedInfo(Value *Leaf, Value *V,
                           const SmallSetVector<Value *, 32> &ExprsInSubprogram,
                           DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
      // Values outside the current subprogram end the traversal.
      if (!ExprsInSubprogram.count(V))
        return;

      // If Leaf is already recorded for V, the operands of V have been (or are
      // currently being) walked for this leaf, so walking them again cannot
      // add anything. Stopping here keeps the traversal linear when
      // sub-expressions are re-used many times and guarantees termination
      // should the operand graph contain a cycle.
      if (!Shared[V].insert(Leaf).second)
        return;

      for (Value *Op : cast<Instruction>(V)->operand_values())
        collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);
    }


    /// Calculate the number of exclusive and shared op counts for expression
    /// starting at \p Root. Expressions used multiple times are counted once.
    /// Limit the matrix operations to the ones in \p ExprsInSubprogram.

    std::pair<OpInfoTy, OpInfoTy>
    sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
               const SmallSetVector<Value *, 32> &ExprsInSubprogram,
               DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
      // Contribute nothing for values outside the subprogram and for values
      // that were already counted. The insertion into ReusedExprs only happens
      // for values inside the subprogram.
      const bool InSubprogram = ExprsInSubprogram.count(Root) != 0;
      if (!InSubprogram || !ReusedExprs.insert(Root).second)
        return {};

      OpInfoTy Exclusive;
      OpInfoTy SharedOps;

      // Root's own ops are exclusive when exactly one leaf is recorded for it
      // in Shared, and shared otherwise.
      const auto &RootInfo = Inst2Matrix.find(Root)->second.getOpInfo();
      if (Shared.find(Root)->second.size() == 1)
        Exclusive = RootInfo;
      else
        SharedOps = RootInfo;

      // Accumulate the contributions of all operands.
      for (Value *Operand : cast<Instruction>(Root)->operand_values()) {
        std::pair<OpInfoTy, OpInfoTy> Sub =
            sumOpInfos(Operand, ReusedExprs, ExprsInSubprogram, Shared);
        Exclusive += Sub.first;
        SharedOps += Sub.second;
      }
      return {Exclusive, SharedOps};
    }


    void emitRemarks() {

      if (!ORE.allowExtraAnalysis(DEBUG_TYPE))

        return;


      // Map matrix operations to their containting subprograms, by traversing

      // the inlinedAt chain. If the function does not have a DISubprogram, we

      // only map them to the containing function.

      MapVector<DISubprogram *, SmallVector<Value *, 8>> Subprog2Exprs;

      for (const auto &KV : Inst2Matrix) {

        if (Func.getSubprogram()) {

          auto *I = cast<Instruction>(KV.first);

          DILocation *Context = I->getDebugLoc();

          while (Context) {

            Subprog2Exprs[getSubprogram(Context->getScope())].push_back(

                KV.first);

            Context = DebugLoc(Context).getInlinedAt();

          }

        } else {

          Subprog2Exprs[nullptr].push_back(KV.first);

        }

      }

      for (auto &KV : Subprog2Exprs) {

        SmallSetVector<Value *, 32> ExprsInSubprogram(KV.second.begin(),

                                                      KV.second.end());

        auto Leaves = getExpressionLeaves(ExprsInSubprogram);


        DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;

        for (Value *Leaf : Leaves)

          collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared);


        // Generate remarks for each leaf.

        for (auto *L : Leaves) {


          DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();

          DILocation *Context = cast<Instruction>(L)->getDebugLoc();

          while (Context) {

            if (getSubprogram(Context->getScope()) == KV.first) {

              Loc = Context;

              break;

            }

            Context = DebugLoc(Context).getInlinedAt();

          }


          SmallPtrSet<Value *, 8> ReusedExprs;

          OpInfoTy Counts, SharedCounts;

          std::tie(Counts, SharedCounts) =

              sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared);


          OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,

                                 cast<Instruction>(L)->getParent());


          Rem << "Lowered with ";

          Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "

              << ore::NV("NumLoads", Counts.NumLoads) << " loads, "

              << ore::NV("NumComputeOps", Counts.NumComputeOps)

              << " compute ops, "

              << ore::NV("NumExposedTransposes", Counts.NumExposedTransposes)

              << " exposed transposes";


          if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||

              SharedCounts.NumComputeOps > 0) {

            Rem << ",\nadditionally "

                << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "

                << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "

                << ore::NV("NumFPOps", SharedCounts.NumComputeOps)

                << " compute ops"

                << " are shared with other expressions";

          }


          Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL));

          ORE.emit(Rem);

        }

      }

    }


    /// Build the textual form of the expression rooted at leaf \p L by running
    /// an ExprLinearizer over it, starting at indentation 0 with no re-used or
    /// shared parent, and return the resulting string.
    std::string
    linearize(Value *L,
              const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
              const SmallSetVector<Value *, 32> &ExprsInSubprogram,
              const DataLayout &DL) {
      ExprLinearizer Printer(DL, Inst2Matrix, Shared, ExprsInSubprogram, L);
      Printer.linearizeExpr(L, /*Indent=*/0, /*ParentReused=*/false,
                            /*ParentShared=*/false);
      return Printer.getResult();
    }

  };

};

} // namespace


/// Run the matrix lowering on \p F. When Visit() reports no change, all
/// analyses are preserved. Otherwise loop info and the dominator tree are
/// preserved unless the pass runs in minimal mode, in which case nothing is.
PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);

  // In minimal mode the lowering is not given the analysis manager.
  LowerMatrixIntrinsics LMT(F, TTI, Minimal ? nullptr : &AM);
  if (!LMT.Visit())
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  if (!Minimal) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  return PA;
}


/// Print this pass for a pipeline description: first whatever the
/// PassInfoMixin base prints, then "<minimal>" in minimal mode or "<>"
/// otherwise.
void LowerMatrixIntrinsicsPass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  using MixinT = PassInfoMixin<LowerMatrixIntrinsicsPass>;
  static_cast<MixinT *>(this)->printPipeline(OS, MapClassName2PassName);
  OS << '<' << (Minimal ? "minimal" : "") << '>';
}

PHI
Rewrite undef for PHI
Definition: AMDGPURewriteUndefForPHI.cpp:100

ToRemove
ReachingDefAnalysis InstSet & ToRemove
Definition: ARMLowOverheadLoops.cpp:531

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: ARMSLSHardening.cpp:73

AliasAnalysis.h

Alignment.h

getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:863

BasicBlockUtils.h

BT
BitTracker BT
Definition: BitTracker.cpp:73

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

D
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")

CommandLine.h

clEnumValN
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686

DataLayout.h

Idx
Returns the sub-type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
Definition: DeadArgumentElimination.cpp:353

DebugInfoMetadata.h

Debug.h

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition: Debug.h:106

DomTreeUpdater.h

Name
std::string Name
Definition: ELFObjHandler.cpp:77

End
bool End
Definition: ELF_riscv.cpp:480

GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:170

vectors
hexagon Hexagon specific predictive commoning for HVX vectors
Definition: HexagonVectorLoopCarriedReuse.cpp:218

IRBuilder.h

CFG.h
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...

Function.h

IntrinsicInst.h

users
iv users
Definition: IVUsers.cpp:48

Instructions.h

isZero
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:557

Matrix
Live Register Matrix
Definition: LiveRegMatrix.cpp:44

LoopInfo.h

LoopUtils.h

getSubprogram
static DISubprogram * getSubprogram(DIScope *Scope)
Helper function to either return Scope, if it is a subprogram or the attached subprogram for a local ...
Definition: LowerMatrixIntrinsics.cpp:94

ForceFusion
static cl::opt< bool > ForceFusion("force-fuse-matrix", cl::init(false), cl::Hidden, cl::desc("Force matrix instruction fusion even if not profitable."))

VerifyShapeInfo
static cl::opt< bool > VerifyShapeInfo("verify-matrix-shapes", cl::Hidden, cl::desc("Enable/disable matrix shape verification."), cl::init(false))

isSplat
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
Definition: LowerMatrixIntrinsics.cpp:102

TileUseLoops
static cl::opt< bool > TileUseLoops("fuse-matrix-use-loops", cl::init(false), cl::Hidden, cl::desc("Generate loop nest for tiling."))

FuseMatrix
static cl::opt< bool > FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden, cl::desc("Enable/disable fusing matrix instructions."))

m_AnyAdd
auto m_AnyAdd(const LTy &L, const RTy &R)
Match any add operation (fp or integer).
Definition: LowerMatrixIntrinsics.cpp:116

AllowContractEnabled
static cl::opt< bool > AllowContractEnabled("matrix-allow-contract", cl::init(false), cl::Hidden, cl::desc("Allow the use of FMAs if available and profitable. This may " "result in different results, due to less rounding error."))

MatrixLayoutTy
MatrixLayoutTy
Definition: LowerMatrixIntrinsics.cpp:79

MatrixLayoutTy::RowMajor
@ RowMajor

MatrixLayoutTy::ColumnMajor
@ ColumnMajor

m_AnyMul
auto m_AnyMul(const LTy &L, const RTy &R)
Match any mul operation (fp or integer).
Definition: LowerMatrixIntrinsics.cpp:110

PrintAfterTransposeOpt
static cl::opt< bool > PrintAfterTransposeOpt("matrix-print-after-transpose-opt", cl::init(false))

DEBUG_TYPE
#define DEBUG_TYPE
Definition: LowerMatrixIntrinsics.cpp:53

TileSize
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))

MatrixLayout
static cl::opt< MatrixLayoutTy > MatrixLayout("matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor), cl::desc("Sets the default matrix layout"), cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major", "Use column-major layout"), clEnumValN(MatrixLayoutTy::RowMajor, "row-major", "Use row-major layout")))

LowerMatrixIntrinsics.h

F
#define F(x, y, z)
Definition: MD5.cpp:55

I
#define I(x, y, z)
Definition: MD5.cpp:58

MatrixBuilder.h

MatrixUtils.h

T1
#define T1
Definition: Mips16ISelLowering.cpp:340

II
uint64_t IntrinsicInst * II
Definition: NVVMIntrRange.cpp:51

OptimizationRemarkEmitter.h

Operation
PowerPC Reduce CR logical Operation
Definition: PPCReduceCRLogicals.cpp:735

PatternMatch.h

PostOrderIterator.h
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.

assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())

getNumElements
static unsigned getNumElements(Type *Ty)
Definition: SLPVectorizer.cpp:254

extractVector
static Value * extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, unsigned EndIndex, const Twine &Name)
Definition: SROA.cpp:2550

insertVector
static Value * insertVector(IRBuilderTy &IRB, Value *Old, Value *V, unsigned BeginIndex, const Twine &Name)
Definition: SROA.cpp:2572

OS
raw_pwrite_stream & OS
Definition: SampleProfWriter.cpp:51

ScopeExit.h
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...

SmallSet.h
This file defines the SmallSet class.

SmallVector.h
This file defines the SmallVector class.

getType
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39

BlockSize
static const int BlockSize
Definition: TarWriter.cpp:33

Ptr
@ Ptr
Definition: TargetLibraryInfo.cpp:77

TargetTransformInfo.h
This pass exposes codegen information to IR-level passes.

getOpcode
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191

ValueTracking.h

VectorUtils.h

LowerStore
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Definition: X86ISelLowering.cpp:25151

LowerLoad
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Definition: X86ISelLowering.cpp:25238

RHS
Value * RHS
Definition: X86PartialReduction.cpp:74

LHS
Value * LHS
Definition: X86PartialReduction.cpp:73

Mul
BinaryOperator * Mul
Definition: X86PartialReduction.cpp:68

T

VectorType
Definition: ItaniumDemangle.h:1173

bool

llvm::AAManager
A manager for alias analyses.
Definition: AliasAnalysis.h:927

llvm::AAResults
Definition: AliasAnalysis.h:314

llvm::AAResults::isNoAlias
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Definition: AliasAnalysis.h:368

llvm::AllocaInst
an instruction to allocate memory on the stack
Definition: Instructions.h:63

llvm::AllocaInst::getAlign
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:124

llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253

llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410

llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41

llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:61

llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448

llvm::BasicBlock::rbegin
reverse_iterator rbegin()
Definition: BasicBlock.h:464

llvm::BasicBlock::reverse_iterator
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179

llvm::BasicBlock::rend
reverse_iterator rend()
Definition: BasicBlock.h:466

llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239

llvm::BinaryOperator
Definition: InstrTypes.h:170

llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition: InstrTypes.h:370

llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341

llvm::CallBase::arg_begin
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1261

llvm::CallBase::getParamAlign
MaybeAlign getParamAlign(unsigned ArgNo) const
Extract the alignment for a call or parameter (0=unknown).
Definition: InstrTypes.h:1748

llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286

llvm::CallBase::arg_end
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1267

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1479

llvm::ConstantAggregateZero::get
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1672

llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:83

llvm::DILocalScope::getSubprogram
DISubprogram * getSubprogram() const
Get the subprogram for this scope.
Definition: DebugInfoMetadata.cpp:1051

llvm::DILocation
Debug location.
Definition: DebugInfoMetadata.h:1988

llvm::DIScope
Base class for scope-like contexts.
Definition: DebugInfoMetadata.h:519

llvm::DISubprogram
Subprogram description.
Definition: DebugInfoMetadata.h:1710

llvm::DWARFExpression::Operation
This class represents an Operation in the Expression.
Definition: DWARFExpression.h:32

llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63

llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33

llvm::DebugLoc::getInlinedAt
DILocation * getInlinedAt() const
Definition: DebugLoc.cpp:39

llvm::DenseMapBase::find
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156

llvm::DenseMapBase::erase
bool erase(const KeyT &Val)
Definition: DenseMap.h:321

llvm::DenseMapBase::count
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152

llvm::DenseMapBase::end
iterator end()
Definition: DenseMap.h:84

llvm::DenseMapBase::insert
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211

llvm::DenseMap
Definition: DenseMap.h:727

llvm::DomTreeUpdater
Definition: DomTreeUpdater.h:30

llvm::DominatorTreeAnalysis
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279

llvm::DominatorTreeBase::applyUpdates
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
Definition: GenericDomTree.h:596

llvm::DominatorTreeBase::Delete
static constexpr UpdateKind Delete
Definition: GenericDomTree.h:253

llvm::DominatorTreeBase::Insert
static constexpr UpdateKind Insert
Definition: GenericDomTree.h:252

llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162

llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122

llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20

llvm::FastMathFlags::setAllowContract
void setAllowContract(bool B=true)
Definition: FMF.h:91

llvm::FastMathFlags::allowReassoc
bool allowReassoc() const
Flag queries.
Definition: FMF.h:65

llvm::FastMathFlags::allowContract
bool allowContract() const
Definition: FMF.h:70

llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563

llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791

llvm::Function
Definition: Function.h:63

llvm::Function::getIntrinsicID
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intri...
Definition: Function.h:251

llvm::Function::isIntrinsic
bool isIntrinsic() const
isIntrinsic - Returns true if the function's name starts with "llvm.".
Definition: Function.h:256

llvm::IRBuilderBase::CreateFAddReduce
CallInst * CreateFAddReduce(Value *Acc, Value *Src)
Create a sequential vector fadd reduction intrinsic of the source vector.
Definition: IRBuilder.cpp:402

llvm::IRBuilderBase::CreateICmpULT
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2286

llvm::IRBuilderBase::CreateFSub
Value * CreateFSub(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1595

llvm::IRBuilderBase::CreateInsertElement
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2511

llvm::IRBuilderBase::CreateAlloca
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1781

llvm::IRBuilderBase::CreateExtractElement
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2499

llvm::IRBuilderBase::CreateAlignedLoad
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1815

llvm::IRBuilderBase::CreateFAdd
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1576

llvm::IRBuilderBase::CreateVectorSplat
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
Definition: IRBuilder.cpp:1163

llvm::IRBuilderBase::CreateAddReduce
CallInst * CreateAddReduce(Value *Src)
Create a vector int add reduction intrinsic of the source vector.
Definition: IRBuilder.cpp:412

llvm::IRBuilderBase::getIntPtrTy
IntegerType * getIntPtrTy(const DataLayout &DL, unsigned AddrSpace=0)
Fetch the type of an integer with size at least as big as that of a pointer in the given address spac...
Definition: IRBuilder.h:594

llvm::IRBuilderBase::setFastMathFlags
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:330

llvm::IRBuilderBase::CreateGEP
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1874

llvm::IRBuilderBase::getInt64
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510

llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900

llvm::IRBuilderBase::CreatePHI
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435

llvm::IRBuilderBase::CreateSub
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1387

llvm::IRBuilderBase::getIntN
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:516

llvm::IRBuilderBase::CreateCondBr
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1164

llvm::IRBuilderBase::CreateLoad
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1798

llvm::IRBuilderBase::CreateShuffleVector
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2533

llvm::IRBuilderBase::CreateAdd
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1370

llvm::IRBuilderBase::CreatePtrToInt
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2142

llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199

llvm::IRBuilderBase::CreateAlignedStore
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834

llvm::IRBuilderBase::CreateFMul
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1614

llvm::IRBuilderBase::CreateFNeg
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1742

llvm::IRBuilderBase::CreateMemCpy
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:677

llvm::IRBuilderBase::CreateMul
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1404

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705

llvm::InstructionCost
Definition: InstructionCost.h:29

llvm::InstructionCost::getInvalid
static InstructionCost getInvalid(CostType Val=0)
Definition: InstructionCost.h:73

llvm::Instruction
Definition: Instruction.h:68

llvm::Instruction::setFastMathFlags
void setFastMathFlags(FastMathFlags FMF)
Convenience function for setting multiple fast-math flags on this instruction, which must be an opera...
Definition: Instruction.cpp:589

llvm::Instruction::eraseFromParent
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94

llvm::Instruction::getFastMathFlags
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
Definition: Instruction.cpp:639

llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48

llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:176

llvm::LoadInst::isVolatile
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:205

llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211

llvm::LocationSize::getValue
TypeSize getValue() const
Definition: MemoryLocation.h:170

llvm::LoopAnalysis
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566

llvm::LoopInfoBase::getLoopFor
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: GenericLoopInfo.h:606

llvm::LoopInfo
Definition: LoopInfo.h:407

llvm::LowerMatrixIntrinsicsPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition: LowerMatrixIntrinsics.cpp:2644

llvm::LowerMatrixIntrinsicsPass::printPipeline
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
Definition: LowerMatrixIntrinsics.cpp:2660

llvm::MapVector
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36

llvm::MapVector::end
iterator end()
Definition: MapVector.h:71

llvm::MapVector::find
iterator find(const KeyT &Key)
Definition: MapVector.h:167

llvm::MapVector::insert
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141

llvm::MatrixBuilder
Definition: MatrixBuilder.h:33

llvm::MatrixBuilder::CreateMatrixTranspose
CallInst * CreateMatrixTranspose(Value *Matrix, unsigned Rows, unsigned Columns, const Twine &Name="")
Create a llvm.matrix.transpose call, transposing Matrix with Rows rows and Columns columns.
Definition: MatrixBuilder.h:110

llvm::MatrixBuilder::CreateMatrixMultiply
CallInst * CreateMatrixMultiply(Value *LHS, Value *RHS, unsigned LHSRows, unsigned LHSColumns, unsigned RHSColumns, const Twine &Name="")
Create a llvm.matrix.multiply call, multiplying matrixes LHS and RHS.
Definition: MatrixBuilder.h:126

llvm::MemoryLocation
Representation for a specific memory location.
Definition: MemoryLocation.h:227

llvm::MemoryLocation::get
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
Definition: MemoryLocation.cpp:35

llvm::MemoryLocation::Size
LocationSize Size
The maximum size of the location, in address-units, or UnknownSize if the size is not known.
Definition: MemoryLocation.h:244

llvm::MemoryLocation::Ptr
const Value * Ptr
The address of the start of the location.
Definition: MemoryLocation.h:235

llvm::MemoryLocation::getForArgument
static MemoryLocation getForArgument(const CallBase *Call, unsigned ArgIdx, const TargetLibraryInfo *TLI)
Return a location representing a particular argument of a call.
Definition: MemoryLocation.cpp:159

llvm::OptimizationRemarkEmitterAnalysis
Definition: OptimizationRemarkEmitter.h:164

llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:32

llvm::OptimizationRemarkEmitter::allowExtraAnalysis
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positives.
Definition: OptimizationRemarkEmitter.h:97

llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:79

llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:762

llvm::PHINode
Definition: Instructions.h:2600

llvm::PoisonValue::get
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878

llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111

llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117

llvm::PreservedAnalyses::preserve
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131

llvm::ReversePostOrderTraversal
Definition: PostOrderIterator.h:299

llvm::SetVector
A vector that has set insertion semantics.
Definition: SetVector.h:57

llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98

llvm::SetVector::count
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264

llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162

llvm::SmallPtrSetImplBase::empty
bool empty() const
Definition: SmallPtrSet.h:93

llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSets.
Definition: SmallPtrSet.h:363

llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452

llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384

llvm::SmallPtrSetImpl::contains
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458

llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition: SmallPtrSet.h:519

llvm::SmallSetVector
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370

llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
Definition: SmallSet.h:132

llvm::SmallSet::empty
bool empty() const
Definition: SmallSet.h:168

llvm::SmallSet::erase
bool erase(const T &V)
Definition: SmallSet.h:193

llvm::SmallSet::insert
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181

llvm::SmallVectorBase::empty
bool empty() const
Definition: SmallVector.h:81

llvm::SmallVectorBase::size
size_t size() const
Definition: SmallVector.h:78

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:573

llvm::SmallVectorImpl::pop_back_val
T pop_back_val()
Definition: SmallVector.h:673

llvm::SmallVectorImpl::append
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683

llvm::SmallVectorTemplateBase::pop_back
void pop_back()
Definition: SmallVector.h:425

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition: SmallVector.h:413

llvm::SmallVectorTemplateCommon::end
iterator end()
Definition: SmallVector.h:269

llvm::SmallVectorTemplateCommon::begin
iterator begin()
Definition: SmallVector.h:267

llvm::SmallVectorTemplateCommon::back
reference back()
Definition: SmallVector.h:308

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196

llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:292

llvm::StoreInst::getAlign
Align getAlign() const
Definition: Instructions.h:333

llvm::StoreInst::isVolatile
bool isVolatile() const
Return true if this is a store to a volatile memory location.
Definition: Instructions.h:325

llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:51

llvm::StringRef::drop_front
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition: StringRef.h:609

llvm::StringRef::size
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150

llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:3172

llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:212

llvm::TargetTransformInfo::getRegisterBitWidth
TypeSize getRegisterBitWidth(RegisterKind K) const
Definition: TargetTransformInfo.cpp:776

llvm::TargetTransformInfo::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
Definition: TargetTransformInfo.cpp:1125

llvm::TargetTransformInfo::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
Definition: TargetTransformInfo.cpp:1215

llvm::TargetTransformInfo::getRegisterClassForType
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
Definition: TargetTransformInfo.cpp:767

llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:264

llvm::TargetTransformInfo::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
Definition: TargetTransformInfo.cpp:940

llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:1175

llvm::TargetTransformInfo::getShuffleCost
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
Definition: TargetTransformInfo.cpp:976

llvm::TargetTransformInfo::getNumberOfRegisters
unsigned getNumberOfRegisters(unsigned ClassID) const
Definition: TargetTransformInfo.cpp:759

llvm::TargetTransformInfo::SK_Splice
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
Definition: TargetTransformInfo.h:1106

llvm::Twine
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition: Twine.h:81

llvm::TypeSize
Definition: TypeSize.h:334

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45

llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.

llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.

llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139

llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355

llvm::UnaryOperator
Definition: InstrTypes.h:100

llvm::UnaryOperator::getOpcode
UnaryOps getOpcode() const
Definition: InstrTypes.h:153

llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43

llvm::User
Definition: User.h:44

llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:228

llvm::Value
LLVM Value Representation.
Definition: Value.h:74

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255

llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397

llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434

llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534

llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421

llvm::Value::hasNUses
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149

llvm::Value::use_empty
bool use_empty() const
Definition: Value.h:344

llvm::Value::uses
iterator_range< use_iterator > uses()
Definition: Value.h:376

llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309

llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:460

llvm::cl::opt
Definition: CommandLine.h:1423

llvm::details::FixedOrScalableQuantity::getFixedValue
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202

llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLFunctionalExtras.h:37

llvm::ilist_detail::node_parent_access::getParent
const ParentTy * getParent() const
Definition: ilist_node.h:32

llvm::iterator_range
A range adaptor for a pair of iterators.
Definition: iterator_range.h:42

llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52

llvm::raw_string_ostream
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661

uint64_t

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143

llvm::AArch64PACKey::IB
@ IB
Definition: AArch64BaseInfo.h:846

llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73

llvm::ARM::ProfileKind::M
@ M

llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34

llvm::Intrinsic::getBaseName
StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
Definition: Intrinsics.cpp:41

llvm::M68k::MemAddrModeKind::U
@ U

llvm::M68k::MemAddrModeKind::V
@ V

llvm::M68k::MemAddrModeKind::K
@ K

llvm::M68k::MemAddrModeKind::L
@ L

llvm::PatternMatch::m_Store
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
Definition: PatternMatch.h:1902

llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1102

llvm::PatternMatch::m_BinOp
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100

llvm::PatternMatch::m_SpecificInt
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982

llvm::PatternMatch::m_FMul
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1174

llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49

llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168

llvm::PatternMatch::m_FAdd
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1108

llvm::PatternMatch::m_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1168

llvm::PatternMatch::m_OneUse
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67

llvm::PatternMatch::m_Load
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
Definition: PatternMatch.h:1895

llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92

llvm::PatternMatch::m_CombineOr
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239

llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:371

llvm::SIEncodingFamily::SI
@ SI
Definition: SIDefines.h:36

llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33

llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32

llvm::X86AS::SS
@ SS
Definition: X86.h:212

llvm::X86II::TA
@ TA
Definition: X86BaseInfo.h:738

llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:137

llvm::cl::values
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to the ValuesClass constructor.
Definition: CommandLine.h:711

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443

llvm::codeview::EncodedFramePtrReg::BasePtr
@ BasePtr

llvm::codeview::FrameCookieKind::Copy
@ Copy

llvm::dxil::ElementType
ElementType
The element type of an SRV or UAV resource.
Definition: DXILABI.h:58

llvm::ms_demangle::IntrinsicFunctionKind::New
@ New

llvm::ms_demangle::QualifierMangleMode::Result
@ Result

llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:135

llvm::rdf::Phi
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390

llvm::rdf::Func
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18

llvm::Offset
@ Offset
Definition: DWP.cpp:480

llvm::PseudoProbeType::Block
@ Block

llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697

llvm::make_scope_exit
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59

llvm::enumerate
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448

llvm::successors
auto successors(const MachineBasicBlock *BB)
Definition: MachineBasicBlock.h:1376

llvm::operator!=
bool operator!=(uint64_t V1, const APInt &V2)
Definition: APInt.h:2082

llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition: iterator_range.h:77

llvm::operator+=
LLVM_ATTRIBUTE_ALWAYS_INLINE DynamicAPInt & operator+=(DynamicAPInt &A, int64_t B)
Definition: DynamicAPInt.h:518

llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value V, returning the original object being addressed.
Definition: ValueTracking.cpp:6761

llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition: STLExtras.h:657

llvm::concatenateVectors
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Definition: VectorUtils.cpp:1095

llvm::operator==
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
Definition: AddressRanges.h:153

llvm::getPointerOperand
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
Definition: Instructions.h:4998

llvm::addStringMetadataToLoop
void addStringMetadataToLoop(Loop *TheLoop, const char *MDString, unsigned V=0)
Set input string into loop metadata by keeping other values intact.
Definition: LoopUtils.cpp:214

llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746

llvm::reverse
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420

llvm::write
Error write(MCStreamer &Out, ArrayRef< std::string > Inputs, OnCuIndexOverflow OverflowOptValue)
Definition: DWP.cpp:625

llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664

llvm::ComplexDeinterleavingOperation::Splat
@ Splat

llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163

llvm::report_fatal_error
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167

llvm::errs
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
Definition: raw_ostream.cpp:907

llvm::RecurKind::Add
@ Add
Sum of integers.

llvm::Op
DWARFExpression::Operation Op
Definition: DWARFExpression.cpp:22

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:565

llvm::SplitBlock
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
Definition: BasicBlockUtils.cpp:1084

llvm::commonAlignment
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212

llvm::VFParamKind::Vector
@ Vector

llvm::createSequentialMask
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
Definition: VectorUtils.cpp:1040

std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860

N
#define N

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39

llvm::BitTracker
Definition: BitTracker.h:35

llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117

llvm::PassInfoMixin
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:69

llvm::TileInfo
A helper struct to create IR loop nests for tiling in IR of the following form: for ColumnLoop....
Definition: MatrixUtils.h:31

llvm::cl::desc
Definition: CommandLine.h:409

llvm::indent
Definition: raw_ostream.h:781