#define DEBUG_TYPE "lower-matrix-intrinsics"

STATISTIC(FlattenedMatrices, "Number of matrix flattenings");
STATISTIC(ReshapedMatrices, "Number of matrix reshapes");

static cl::opt<bool>
    FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,
               cl::desc("Enable/disable fusing matrix instructions."));
static cl::opt<unsigned> TileSize(
    "fuse-matrix-tile-size", cl::init(4), cl::Hidden,
    cl::desc(
        "Tile size for matrix instruction fusion using square-shaped tiles."));
static cl::opt<bool> TileUseLoops("fuse-matrix-use-loops", cl::init(false),
                                  cl::Hidden,
                                  cl::desc("Generate loop nest for tiling."));
static cl::opt<bool> ForceFusion(
    "force-fuse-matrix", cl::init(false), cl::Hidden,
    cl::desc("Force matrix instruction fusion even if not profitable."));
static cl::opt<bool> AllowContractEnabled(
    "matrix-allow-contract", cl::init(false), cl::Hidden,
    cl::desc("Allow the use of FMAs if available and profitable. This may "
             "result in different results, due to less rounding error."));

static cl::opt<bool>
    VerifyShapeInfo("verify-matrix-shapes", cl::Hidden,
                    cl::desc("Enable/disable matrix shape verification."),
                    cl::init(false));

static cl::opt<MatrixLayoutTy> MatrixLayout(
    "matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor),
    cl::desc("Sets the default matrix layout"),
    cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major",
                          "Use column-major layout"),
               clEnumValN(MatrixLayoutTy::RowMajor, "row-major",
                          "Use row-major layout")));
/// Return true if V is a splat of a value (which is used when multiplying a
/// matrix with a scalar).
static bool isSplat(Value *V) {
  if (auto *SV = dyn_cast<ShuffleVectorInst>(V))
    return SV->isZeroEltSplat();
  return false;
}

/// Match any mul operation (fp or integer).
template <typename LTy, typename RTy>
auto m_AnyMul(const LTy &L, const RTy &R) {
  return m_CombineOr(m_Mul(L, R), m_FMul(L, R));
}

/// Match any add operation (fp or integer).
template <typename LTy, typename RTy>
auto m_AnyAdd(const LTy &L, const RTy &R) {
  return m_CombineOr(m_Add(L, R), m_FAdd(L, R));
}
// Compute the address of the vector with index VecIdx, with Stride elements
// between the starts of two consecutive vectors.
static Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
                                unsigned NumElements, Type *EltType,
                                IRBuilder<> &Builder) {
  assert((!isa<ConstantInt>(Stride) ||
          cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
         "Stride must be >= the number of elements in the result vector.");

  // The selected vector starts VecIdx * Stride elements past BasePtr.
  Value *VecStart = Builder.CreateMul(VecIdx, Stride, "vec.start");
  VecStart = Builder.CreateGEP(EltType, BasePtr, VecStart, "vec.gep");
  return VecStart;
}
struct ShapeInfo {
  unsigned NumRows;
  unsigned NumColumns;
  bool IsColumnMajor;

  ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
      : NumRows(NumRows), NumColumns(NumColumns),
        IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}

  ShapeInfo(Value *NumRows, Value *NumColumns)
      : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(),
                  cast<ConstantInt>(NumColumns)->getZExtValue()) {}

  bool operator==(const ShapeInfo &other) {
    return NumRows == other.NumRows && NumColumns == other.NumColumns;
  }
  bool operator!=(const ShapeInfo &other) { return !(*this == other); }

  /// Returns true if shape-information is defined, meaning both dimensions
  /// are != 0.
  operator bool() const {
    assert(NumRows == 0 || NumColumns != 0);
    return NumRows != 0;
  }

  unsigned getStride() const {
    if (IsColumnMajor)
      return NumRows;
    return NumColumns;
  }

  unsigned getNumVectors() const {
    if (IsColumnMajor)
      return NumColumns;
    return NumRows;
  }

  /// Returns the transposed shape.
  ShapeInfo t() const { return ShapeInfo(NumColumns, NumRows); }

  friend raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI);
};

static raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI) {
  return OS << SI.NumRows << 'x' << SI.NumColumns;
}
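// Example (illustrative, not part of the original source): for a 4x2 matrix
// in the default column-major layout, ShapeInfo(4, 2) reports getStride() == 4
// (elements between the starts of consecutive columns), getNumVectors() == 2
// (one vector per column), t() yields the transposed 2x4 shape, and the stream
// operator prints "4x2".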
/// Return true if V is an operation that preserves the shape of its operands:
/// element-wise casts, a small set of element-wise intrinsics, and
/// element-wise instructions such as PHIs and FNeg.
static bool isUniformShape(Value *V) {
  if (auto *Cast = dyn_cast<CastInst>(V)) {
    switch (Cast->getOpcode()) {
    case llvm::Instruction::Trunc:
    case llvm::Instruction::ZExt:
    case llvm::Instruction::SExt:
    case llvm::Instruction::FPToUI:
    case llvm::Instruction::FPToSI:
    case llvm::Instruction::UIToFP:
    case llvm::Instruction::SIToFP:
    case llvm::Instruction::FPTrunc:
    case llvm::Instruction::FPExt:
      return true;
    case llvm::Instruction::AddrSpaceCast:
    case CastInst::PtrToAddr:
    case CastInst::PtrToInt:
    case CastInst::IntToPtr:
      return false;
    case CastInst::BitCast: {
      // A bitcast keeps the shape only if it keeps the number of elements.
      return SrcVTy->getNumElements() == DestVTy->getNumElements();
    }
    case llvm::Instruction::CastOpsEnd:
      break;
    }
  }

  if (auto *II = dyn_cast<IntrinsicInst>(V))
    switch (II->getIntrinsicID()) {
    case Intrinsic::abs:
    case Intrinsic::fabs:
      return true;
    default:
      return false;
    }

  switch (cast<Instruction>(V)->getOpcode()) {
  case Instruction::PHI:
  case Instruction::FNeg:
    return true;
  default:
    return false;
  }
}

/// Compute the shape of the result of instruction \p I, either from the matrix
/// intrinsic it is, or from the shapes already recorded in \p ShapeMap.
static std::optional<ShapeInfo>
computeShapeInfoForInst(Instruction *I,
                        const DenseMap<Value *, ShapeInfo> &ShapeMap) {
  // llvm.matrix.multiply of an MxN matrix with an NxK matrix yields MxK.
  return ShapeInfo(M, K);
  // llvm.matrix.transpose of an MxN matrix yields NxM.
  return ShapeInfo(N, M);
  return ShapeInfo(N, M);
  return ShapeInfo(M, N);
  // Otherwise reuse a shape that was already recorded for the operand.
  auto OpShape = ShapeMap.find(MatrixA);
  if (OpShape != ShapeMap.end())
    return OpShape->second;

  // For shape-preserving (uniform) operations, take the shape of any shaped
  // operand.
  auto Ops = I->operands();
  for (auto &Op : ShapedOps) {
    auto OpShape = ShapeMap.find(Op.get());
    if (OpShape != ShapeMap.end())
      return OpShape->second;
  }
  return std::nullopt;
}
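// Illustrative IR example (not from this file): for a multiply of a 4x2 matrix
// by a 2x3 matrix,
//   %c = call <12 x double> @llvm.matrix.multiply.v12f64.v8f64.v6f64(
//            <8 x double> %a, <6 x double> %b, i32 4, i32 2, i32 3)
// computeShapeInfoForInst() returns ShapeInfo(4, 3) for %c, taking the outer
// dimensions from the intrinsic's immediate shape arguments.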
class LowerMatrixIntrinsics {
  Function &Func;
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  DominatorTree *DT = nullptr;
  LoopInfo *LI = nullptr;
  OptimizationRemarkEmitter *ORE = nullptr;

  /// Per-expression operation counters, used for optimization remarks.
  struct OpInfoTy {
    unsigned NumStores = 0;
    unsigned NumLoads = 0;
    unsigned NumComputeOps = 0;
    unsigned NumExposedTransposes = 0;

    OpInfoTy &operator+=(const OpInfoTy &RHS) {
      NumStores += RHS.NumStores;
      NumLoads += RHS.NumLoads;
      NumComputeOps += RHS.NumComputeOps;
      NumExposedTransposes += RHS.NumExposedTransposes;
      return *this;
    }
  };

  /// A matrix value, split into its row or column vectors.
  struct MatrixTy {
    SmallVector<Value *, 16> Vectors;
    OpInfoTy OpInfo;
    bool IsColumnMajor = true;

    MatrixTy() : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
    MatrixTy(unsigned NumRows, unsigned NumColumns, Type *EltTy)
        : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {
      unsigned D = isColumnMajor() ? NumColumns : NumRows;
      for (unsigned J = 0; J < D; ++J)
        addVector(PoisonValue::get(FixedVectorType::get(
            EltTy, isColumnMajor() ? NumRows : NumColumns)));
    }

    Value *getVector(unsigned i) const { return Vectors[i]; }
    Value *getColumn(unsigned i) const {
      assert(isColumnMajor() && "only supported for column-major matrixes");
      return Vectors[i];
    }
    Value *getRow(unsigned i) const {
      assert(!isColumnMajor() && "only supported for row-major matrixes");
      return Vectors[i];
    }

    void setVector(unsigned i, Value *V) { Vectors[i] = V; }

    Type *getElementType() const { return getVectorTy()->getElementType(); }

    unsigned getNumVectors() const {
      if (isColumnMajor())
        return getNumColumns();
      return getNumRows();
    }

    unsigned getNumColumns() const {
      if (isColumnMajor())
        return Vectors.size();
      assert(Vectors.size() > 0 && "Cannot call getNumRows without columns");
      return getVectorTy()->getNumElements();
    }

    unsigned getNumRows() const {
      if (isColumnMajor()) {
        assert(Vectors.size() > 0 && "Cannot call getNumRows without columns");
        return getVectorTy()->getNumElements();
      }
      return Vectors.size();
    }

    void addVector(Value *V) { Vectors.push_back(V); }

    FixedVectorType *getColumnTy() {
      assert(isColumnMajor() && "only supported for column-major matrixes");
      return getVectorTy();
    }

    FixedVectorType *getVectorTy() const {
      return cast<FixedVectorType>(Vectors[0]->getType());
    }

    iterator_range<SmallVector<Value *, 8>::iterator> columns() {
      assert(isColumnMajor() &&
             "columns() only supported for column-major matrixes");
      return make_range(Vectors.begin(), Vectors.end());
    }

    iterator_range<SmallVector<Value *, 8>::iterator> vectors() {
      return make_range(Vectors.begin(), Vectors.end());
    }

    /// Embed the matrix into a single flat vector, concatenating the
    /// row/column vectors when there is more than one.
    Value *embedInVector(IRBuilder<> &Builder) const {
      return Vectors.size() == 1 ? Vectors[0]
                                 : concatenateVectors(Builder, Vectors);
    }

    MatrixTy &addNumLoads(unsigned N) {
      OpInfo.NumLoads += N;
      return *this;
    }

    void setNumLoads(unsigned N) { OpInfo.NumLoads = N; }

    MatrixTy &addNumStores(unsigned N) {
      OpInfo.NumStores += N;
      return *this;
    }

    MatrixTy &addNumExposedTransposes(unsigned N) {
      OpInfo.NumExposedTransposes += N;
      return *this;
    }

    MatrixTy &addNumComputeOps(unsigned N) {
      OpInfo.NumComputeOps += N;
      return *this;
    }

    unsigned getNumStores() const { return OpInfo.NumStores; }
    unsigned getNumLoads() const { return OpInfo.NumLoads; }
    unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; }

    const OpInfoTy &getOpInfo() const { return OpInfo; }

    bool isColumnMajor() const { return IsColumnMajor; }

    unsigned getStride() const {
      if (isColumnMajor())
        return getNumRows();
      return getNumColumns();
    }

    ShapeInfo shape() const { return {getNumRows(), getNumColumns()}; }

    /// Extract a vector of \p NumElts starting at row \p I and column \p J.
    Value *extractVector(unsigned I, unsigned J, unsigned NumElts,
                         IRBuilder<> &Builder) const {
      Value *Vec = isColumnMajor() ? getColumn(J) : getRow(I);
      assert(cast<FixedVectorType>(Vec->getType())->getNumElements() >=
                 NumElts &&
             "Extracted vector will contain poison values");
      return Builder.CreateShuffleVector(
          Vec, createSequentialMask(isColumnMajor() ? I : J, NumElts, 0),
          "block");
    }
  };
  /// Maps values to their known matrix shape.
  DenseMap<Value *, ShapeInfo> ShapeMap;

  /// Instructions to remove once lowering is done.
  SmallVector<Instruction *, 16> ToRemove;

  /// Maps instructions to their lowered MatrixTy result.
  MapVector<Value *, MatrixTy> Inst2ColumnMatrix;

  static FastMathFlags getFastMathFlags(Instruction *Inst) {

  LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
                        FunctionAnalysisManager *AM)
      : Func(F), DL(F.getDataLayout()), TTI(TTI), AM(AM) {}

  unsigned getNumOps(Type *VT) {

  bool isMinimal() const {

  /// Estimate the number of vector ops needed for \p N elements of type
  /// \p ST, given the target's fixed-width vector register size.
  unsigned getNumOps(Type *ST, unsigned N) {
    return std::ceil((ST->getPrimitiveSizeInBits() * N).getFixedValue() /
                     double(TTI.getRegisterBitWidth(
                                TargetTransformInfo::RGK_FixedWidthVector)
                                .getFixedValue()));
  }

  /// Return the MatrixTy for \p MatrixVal with shape \p SI, reusing a
  /// previously lowered result when the shapes match, or splitting the flat
  /// vector value into row/column vectors with shuffles otherwise.
  MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
                     IRBuilder<> &Builder) {
    assert(cast<FixedVectorType>(MatrixVal->getType())->getNumElements() ==
               SI.NumRows * SI.NumColumns &&
           "The vector size must match the number of matrix elements");

    auto Found = Inst2ColumnMatrix.find(MatrixVal);
    if (Found != Inst2ColumnMatrix.end()) {
      MatrixTy &M = Found->second;
      if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns())
        return M;
      // Shapes disagree: flatten the cached result and re-split below.
      MatrixVal = M.embedInVector(Builder);
    }

    SmallVector<Value *, 16> SplitVecs;
    for (unsigned MaskStart = 0; /* ... */; MaskStart += SI.getStride()) {
    }

    if (Found != Inst2ColumnMatrix.end()) {
      LLVM_DEBUG(dbgs() << "matrix reshape from " << Found->second.shape()
                        << " to " << SI << " using at least "
                        << SplitVecs.size() << " shuffles on behalf of:\n");
    } else if (!ShapeMap.contains(MatrixVal)) {
      LLVM_DEBUG(dbgs()
                 << "splitting a " << SI << " matrix with " << SplitVecs.size()
                 << " shuffles because we do not have a shape-aware lowering for "
                    /* ... */);
    }
  /// Record that \p V has shape \p Shape. Returns true if a new entry was
  /// inserted into ShapeMap.
  bool setShapeInfo(Value *V, ShapeInfo Shape) {
    assert(Shape && "Shape not set");

    auto SIter = ShapeMap.find(V);
    if (SIter != ShapeMap.end()) {
      if (VerifyShapeInfo && (SIter->second.NumRows != Shape.NumRows ||
                              SIter->second.NumColumns != Shape.NumColumns)) {
        errs() << "Conflicting shapes (" << SIter->second.NumRows << "x"
               << SIter->second.NumColumns << " vs " << Shape.NumRows << "x"
               << Shape.NumColumns << ") for " << *V << "\n";
        report_fatal_error(
            "Matrix shape verification failed, compilation aborted!");
      }

      LLVM_DEBUG(dbgs() << "  " << SIter->second.NumRows << " "
                        << SIter->second.NumColumns << " for " << *V << "\n");
      return false;
    }

    ShapeMap.insert({V, Shape});
    LLVM_DEBUG(dbgs() << "  " << Shape.NumRows << " x " << Shape.NumColumns
                      << " for " << *V << "\n");
    return true;
  }

  /// Returns true if shape information can be used for \p V.
  bool supportsShapeInfo(Value *V) {
    if (auto *II = dyn_cast<IntrinsicInst>(V)) {
      switch (II->getIntrinsicID()) {
      case Intrinsic::matrix_multiply:
      case Intrinsic::matrix_transpose:
      case Intrinsic::matrix_column_major_load:
      case Intrinsic::matrix_column_major_store:
        return true;
      default:
        return isUniformShape(II);
      }
    }
  /// Propagate shape information from matrix intrinsics to their users.
  propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) {
    SmallVector<Instruction *, 32> NewWorkList;
    while (!WorkList.empty()) {
      Instruction *Inst = WorkList.pop_back_val();

      // New entry: compute and record the shape, then visit users without a
      // known shape.
      bool Propagate = false;
      if (auto SI = computeShapeInfoForInst(Inst, ShapeMap))
        Propagate = setShapeInfo(Inst, *SI);

      if (Propagate) {
        NewWorkList.push_back(Inst);
        for (auto *User : Inst->users())
          if (ShapeMap.count(User) == 0)
            if (auto *UI = dyn_cast<Instruction>(User))
              WorkList.push_back(UI);
      }
    }
    return NewWorkList;
  }

  /// Propagate the shape of users back to the operands of instructions with
  /// shape information.
  propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) {
    SmallVector<Instruction *, 32> NewWorkList;
    auto pushInstruction = [](Value *V,
                              SmallVectorImpl<Instruction *> &WorkList) {
      if (auto *I = dyn_cast<Instruction>(V))
        WorkList.push_back(I);
    };

    while (!WorkList.empty()) {
      Value *V = WorkList.pop_back_val();
      size_t BeforeProcessingV = WorkList.size();

      // For a multiply of an MxN matrix with an NxK matrix, propagate the
      // shapes back to both operands.
      if (setShapeInfo(MatrixA, {M, N}))
        pushInstruction(MatrixA, WorkList);
      if (setShapeInfo(MatrixB, {N, K}))
        pushInstruction(MatrixB, WorkList);

      // For a transpose, the operand has the same MxN shape arguments.
      if (setShapeInfo(MatrixA, {M, N}))
        pushInstruction(MatrixA, WorkList);

      // For a column-major store of an MxN matrix, the stored value is MxN.
      if (setShapeInfo(MatrixA, {M, N})) {
        pushInstruction(MatrixA, WorkList);
      }

      // Shape-preserving (uniform) operations forward their own shape to all
      // shaped operands.
      ShapeInfo Shape = ShapeMap[V];
      for (Use &U : ShapedOps) {
        if (setShapeInfo(U.get(), Shape))
          pushInstruction(U.get(), WorkList);
      }

      // Re-visit users of any values whose shape was set while processing V.
      for (size_t I = BeforeProcessingV; I != WorkList.size(); I++)
        for (User *U : WorkList[I]->users())
          if (U != V && ShapeMap.count(U) == 0)
            if (auto *UI = dyn_cast<Instruction>(U))
              NewWorkList.push_back(UI);
    }
    return NewWorkList;
  }

  /// Create transposed versions of \p Op0 and \p Op1, record their shapes and
  /// let \p Operation build the replacement instruction from them.
  Instruction *distributeTransposes(
      Value *Op0, ShapeInfo Shape0, Value *Op1, ShapeInfo Shape1,
      MatrixBuilder &Builder,
      function_ref<Instruction *(Value *, ShapeInfo, Value *, ShapeInfo)>
          Operation) {
    Value *T0 = Builder.CreateMatrixTranspose(
        Op0, Shape0.NumRows, Shape0.NumColumns, Op0->getName() + "_t");
    // We run after shape propagation, so add shapes for the newly created
    // instructions so they are lowered later.
    setShapeInfo(T0, Shape0.t());
    Value *T1 = Builder.CreateMatrixTranspose(
        Op1, Shape1.NumRows, Shape1.NumColumns, Op1->getName() + "_t");
    setShapeInfo(T1, Shape1.t());
    return Operation(T0, Shape0.t(), T1, Shape1.t());
  }
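  // The rewrites built on top of distributeTransposes() rely on the standard
  // transpose identities: (A * B)^T == B^T * A^T for an RxK times KxC product,
  // and (A + B)^T == A^T + B^T for element-wise operations. Sinking a
  // transpose through its operands this way exposes t(t(X)) pairs that cancel.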
  void eraseFromParentAndRemoveFromShapeMap(Instruction *Inst) {
    ShapeMap.erase(Inst);
    Inst->eraseFromParent();
  }

  /// Erase \p Inst, advancing the reverse iterator \p II first if it currently
  /// points at \p Inst.
  void eraseFromParentAndMove(Value *V, BasicBlock::reverse_iterator &II,
                              BasicBlock &BB) {
    auto *Inst = cast<Instruction>(V);
    if (II != BB.rend() && Inst == &*II)
      ++II;
    eraseFromParentAndRemoveFromShapeMap(Inst);
  }

  void updateShapeAndReplaceAllUsesWith(Instruction &Old, Value *New) {
    // Transfer the shape of Old to New (when New supports shape info), then
    // replace all uses.
    auto S = ShapeMap.find(&Old);
    if (S != ShapeMap.end()) {
      ShapeMap.erase(S);
      if (supportsShapeInfo(New))
        ShapeMap.insert({New, S->second});
    }
    Old.replaceAllUsesWith(New);
  }

  /// Sink a transpose \p I towards its operands, rewriting the patterns below
  /// so transposes end up next to each other and can cancel.
  Instruction *sinkTranspose(Instruction &I, BasicBlock::reverse_iterator &II,
                             bool &Changed) {
    IRBuilder<> IB(&I);
    MatrixBuilder Builder(IB);

    Value *TA, *TAMA, *TAMB;
    ConstantInt *R, *K, *C;

    // t(t(A)) -> A
    updateShapeAndReplaceAllUsesWith(I, TATA);
    eraseFromParentAndMove(&I, II, BB);
    eraseFromParentAndMove(TA, II, BB);

    // t(A), A a splat of a scalar: the transpose is a no-op.
    updateShapeAndReplaceAllUsesWith(I, TA);
    eraseFromParentAndMove(&I, II, BB);

    // t(A * B) -> t(B) * t(A)
    auto NewInst = distributeTransposes(
        TAMB, {K, C}, TAMA, {R, K}, Builder,
        [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) {
          return LocalBuilder.CreateMatrixMultiply(
              T0, T1, Shape0.NumRows, Shape0.NumColumns,
              Shape1.NumColumns, "mmul");
        });
    updateShapeAndReplaceAllUsesWith(I, NewInst);
    eraseFromParentAndMove(&I, II, BB);
    eraseFromParentAndMove(TA, II, BB);

    // t(A * b) -> t(A) * b, for a scalar splat b: distribute the transpose
    // over the element-wise multiply.
    NewInst = distributeTransposes(
        TAMA, {R, C}, TAMB, {R, C}, Builder,
        [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) {
          bool IsFP = I.getType()->isFPOrFPVectorTy();
          auto *Mul = IsFP ? LocalBuilder.CreateFMul(T0, T1, "mmul")
                           : LocalBuilder.CreateMul(T0, T1, "mmul");
          auto *Result = cast<Instruction>(Mul);
          setShapeInfo(Result, Shape0);
          return Result;
        });
    updateShapeAndReplaceAllUsesWith(I, NewInst);
    eraseFromParentAndMove(&I, II, BB);
    eraseFromParentAndMove(TA, II, BB);

    // t(A + B) -> t(A) + t(B)
    NewInst = distributeTransposes(
        TAMA, {R, C}, TAMB, {R, C}, Builder,
        [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) {
          bool IsFP = I.getType()->isFPOrFPVectorTy();
          auto *Add = IsFP ? LocalBuilder.CreateFAdd(T0, T1, "madd")
                           : LocalBuilder.CreateAdd(T0, T1, "madd");
          auto *Result = cast<Instruction>(Add);
          setShapeInfo(Result, Shape0);
          return Result;
        });
    updateShapeAndReplaceAllUsesWith(I, NewInst);
    eraseFromParentAndMove(&I, II, BB);
    eraseFromParentAndMove(TA, II, BB);
    return NewInst;
  }
  /// Lift transposes out of binary operations:
  ///   t(A) * t(B) -> t(B * A)  and  t(A) + t(B) -> t(A + B).
  bool liftTranspose(Instruction &I) {
    // Erase dead instructions after lifting transposes from binops.
    auto CleanupBinOp = [this](Instruction &T, Value *A, Value *B) {
      eraseFromParentAndRemoveFromShapeMap(&T);
      if (A != B && B->use_empty())
        eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(B));
    };

    Value *A, *B, *AT, *BT;
    ConstantInt *R, *K, *C;

    // A^t * B^t -> (B * A)^t
    IRBuilder<> IB(&I);
    MatrixBuilder Builder(IB);
    Value *M = Builder.CreateMatrixMultiply(
        BT, AT, C->getZExtValue(), K->getZExtValue(), R->getZExtValue());
    setShapeInfo(M, {C, R});
    updateShapeAndReplaceAllUsesWith(I, NewInst);
    CleanupBinOp(I, A, B);

    // A^t + B^t -> (A + B)^t.
    auto *Add = Builder.CreateFAdd(AT, BT, "mfadd");
    MatrixBuilder MBuilder(Builder);
    Instruction *NewInst = MBuilder.CreateMatrixTranspose(
        Add, R->getZExtValue(), C->getZExtValue(), "mfadd_t");
    updateShapeAndReplaceAllUsesWith(I, NewInst);
    assert(computeShapeInfoForInst(NewInst, ShapeMap) ==
               computeShapeInfoForInst(&I, ShapeMap) &&
           "Shape of new instruction doesn't match original shape.");
    CleanupBinOp(I, A, B);

    setShapeInfo(AddI, {R, C});
    assert(computeShapeInfoForInst(AddI, ShapeMap).value_or(ShapeMap[AddI]) ==
               ShapeMap[AddI] &&
           "Shape of updated addition doesn't match cached shape.");
  }
  /// Sink and lift transposes across the function.
  bool optimizeTransposes() {
    bool Changed = false;
    // First sink all known transposes.
    for (BasicBlock &BB : reverse(Func)) {
      if (Instruction *NewInst = sinkTranspose(I, II, Changed))
        Changed = true;
    }

    // Then lift transposes out of the remaining binary operations.
    for (BasicBlock &BB : Func) {
    }
    return Changed;
  }

  bool Visit() {
    SmallVector<Instruction *, 32> WorkList;

    // Initially only the shape of matrix intrinsics is known.
    for (BasicBlock &BB : Func)
      for (Instruction &Inst : BB) {
        IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst);
        if (!II)
          continue;
        switch (II->getIntrinsicID()) {
        case Intrinsic::matrix_multiply:
        case Intrinsic::matrix_transpose:
        case Intrinsic::matrix_column_major_load:
        case Intrinsic::matrix_column_major_store:
          WorkList.push_back(&Inst);
          break;
        default:
          break;
        }
      }

    // Avoid unnecessary work if there are no matrix intrinsics in the
    // function.
    if (WorkList.empty())
      return false;

    if (AM) {
      ORE = &AM->getResult<OptimizationRemarkEmitterAnalysis>(Func);
      AA = &AM->getResult<AAManager>(Func);
      DT = &AM->getResult<DominatorTreeAnalysis>(Func);
      LI = &AM->getResult<LoopAnalysis>(Func);
    }

    // Propagate shapes until nothing changes any longer.
    while (!WorkList.empty()) {
      WorkList = propagateShapeForward(WorkList);
      WorkList = propagateShapeBackward(WorkList);
    }

    bool Changed = false;
    if (!isMinimal()) {
      Changed |= optimizeTransposes();
      if (PrintAfterTransposeOpt) {
        dbgs() << "Dump after matrix transpose optimization:\n";
        Func.print(dbgs());
      }
    }

    SmallVector<Instruction *, 16> MatrixInsts;
    SmallVector<CallInst *, 16> MaybeFusableInsts;
    SmallVector<IntrinsicInst *, 16> LifetimeEnds;

    // Collect all instructions with shape information in reverse post order,
    // so operands are lowered before their users. Also remember multiply
    // candidates for fusion and lifetime.end intrinsics.
    ReversePostOrderTraversal<Function *> RPOT(&Func);
    for (auto *BB : RPOT)
      for (Instruction &I : *BB) {
        if (!ShapeMap.contains(&I))
          continue;
        MatrixInsts.push_back(&I);
      }

    // First, try to lower dot products and fuse multiplies; handled
    // instructions are marked as fused.
    SmallPtrSet<Instruction *, 16> FusedInsts;
    for (CallInst *CI : MaybeFusableInsts)
      lowerDotProduct(CI, FusedInsts, getFastMathFlags(CI));

    for (CallInst *CI : MaybeFusableInsts)
      LowerMatrixMultiplyFused(CI, FusedInsts, LifetimeEnds);

    // Create placeholder PHI lowerings next, so cyclic dependences between
    // PHIs and their incoming values can be handled.
    for (Instruction *Inst : MatrixInsts) {
      if (FusedInsts.count(Inst))
        continue;
      auto *PHI = dyn_cast<PHINode>(Inst);
      if (!PHI)
        continue;

      const ShapeInfo &SI = ShapeMap.at(Inst);
      MatrixTy PhiM(SI.NumRows, SI.NumColumns, EltTy);

      IRBuilder<> Builder(Inst);
      for (unsigned VI = 0, VE = PhiM.getNumVectors(); VI != VE; ++VI)
        PhiM.setVector(VI, Builder.CreatePHI(PhiM.getVectorTy(),
                                             PHI->getNumIncomingValues(),
                                             PHI->getName()));
      assert(!Inst2ColumnMatrix.contains(PHI) && "map already contains phi?");
      Inst2ColumnMatrix[PHI] = PhiM;
    }

    // Lower the remaining instructions with shape information.
    for (Instruction *Inst : MatrixInsts) {
      if (FusedInsts.count(Inst))
        continue;

      IRBuilder<> Builder(Inst);
      const ShapeInfo &SI = ShapeMap.at(Inst);

      MatrixTy Result;
      if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
        Result = VisitBinaryOperator(BinOp, SI, Builder);
      else if (auto *Cast = dyn_cast<CastInst>(Inst))
        Result = VisitCastInstruction(Cast, SI, Builder);
      else if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
        Result = VisitUnaryOperator(UnOp, SI, Builder);
      else if (auto *Intr = dyn_cast<IntrinsicInst>(Inst))
        Result = VisitIntrinsicInst(Intr, SI, Builder);

      finalizeLowering(Inst, Result, Builder);
      Changed = true;
    }

    if (ORE) {
      RemarkGenerator RemarkGen(Inst2ColumnMatrix, *ORE, Func);
      RemarkGen.emitRemarks();
    }

    // Delete the instructions backwards, poisoning any remaining uses; any
    // poisoned-but-present instruction at the end indicates a bug.
    SmallPtrSet<Instruction *, 16> PoisonedInsts;
    for (auto *Inst : reverse(ToRemove)) {
      if (auto *Poisoned = dyn_cast<Instruction>(U.getUser()))
        PoisonedInsts.insert(Poisoned);
      Inst->eraseFromParent();
      PoisonedInsts.erase(Inst);
    }
    if (!PoisonedInsts.empty()) {
      dbgs() << "Poisoned but present instructions:\n";
      for (auto *I : PoisonedInsts)
        dbgs() << *I << "\n";
      llvm_unreachable("Poisoned but instruction not removed");
    }

    return Changed;
  }
  MatrixTy VisitIntrinsicInst(IntrinsicInst *Inst, const ShapeInfo &SI,
                              IRBuilder<> &Builder) {
    switch (Inst->getIntrinsicID()) {
    case Intrinsic::matrix_multiply:
      return LowerMultiply(Inst, Builder);
    case Intrinsic::matrix_transpose:
      return LowerTranspose(Inst, Builder);
    case Intrinsic::matrix_column_major_load:
      return LowerColumnMajorLoad(Inst, Builder);
    case Intrinsic::matrix_column_major_store:
      return LowerColumnMajorStore(Inst, Builder);
    case Intrinsic::abs:
    case Intrinsic::fabs: {
      MatrixTy Result;
      MatrixTy M = getMatrix(Inst->getOperand(0), SI, Builder);
      Builder.setFastMathFlags(getFastMathFlags(Inst));

      for (auto *Vector : M.vectors()) {
        switch (Inst->getIntrinsicID()) {
        case Intrinsic::abs:
          Result.addVector(Builder.CreateBinaryIntrinsic(
              Intrinsic::abs, Vector, Inst->getOperand(1)));
          continue;
        case Intrinsic::fabs:
          Result.addVector(
              Builder.CreateUnaryIntrinsic(Inst->getIntrinsicID(), Vector));
          continue;
        default:
          llvm_unreachable("unexpected intrinsic");
        }
      }

      return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                     Result.getNumVectors());
    }
    default:
      llvm_unreachable(
          "only intrinsics supporting shape info should be seen here");
    }
  }

  /// Compute the alignment for the vector at index \p Idx, with \p Stride
  /// elements between vectors. The vector at index 0 has alignment \p A; for
  /// a constant stride, later vectors get the common alignment of the initial
  /// alignment and the byte offset of the vector.
  Align getAlignForIndex(unsigned Idx, Value *Stride, Type *ElementTy,
                         MaybeAlign A) const {
    Align InitialAlign = DL.getValueOrABITypeAlignment(A, ElementTy);
    if (Idx == 0)
      return InitialAlign;

    TypeSize ElementSizeInBits = DL.getTypeSizeInBits(ElementTy);
    if (auto *ConstStride = dyn_cast<ConstantInt>(Stride)) {
      uint64_t StrideInBytes =
          ConstStride->getZExtValue() * ElementSizeInBits / 8;
      return commonAlignment(InitialAlign, Idx * StrideInBytes);
    }
    return commonAlignment(InitialAlign, ElementSizeInBits / 8);
  }

  IntegerType *getIndexType(Value *Ptr) const {
    return cast<IntegerType>(DL.getIndexType(Ptr->getType()));
  }

  Value *getIndex(Value *Ptr, uint64_t V) const {
    return ConstantInt::get(getIndexType(Ptr), V);
  }

  Value *castToIndexType(Value *Ptr, Value *V, IRBuilder<> &Builder) const {
    assert(V->getType()->isIntegerTy() &&
           "Attempted to cast non-integral type to integer index");
    if (V->getType() != getIndexType(Ptr))
      V = Builder.CreateZExtOrTrunc(V, getIndexType(Ptr),
                                    V->getName() + ".cast");
    return V;
  }
  /// Load a matrix with \p Shape starting at \p Ptr, using \p Stride elements
  /// between the starts of consecutive vectors.
  MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
                      bool IsVolatile, ShapeInfo Shape, IRBuilder<> &Builder) {
    auto *VType = cast<FixedVectorType>(Ty);
    Type *EltTy = VType->getElementType();
    auto *VecTy = FixedVectorType::get(EltTy, Shape.getStride());
    MatrixTy Result;
    Stride = castToIndexType(Ptr, Stride, Builder);
    for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
      Value *GEP = computeVectorAddr(Ptr, getIndex(Ptr, I), Stride,
                                     Shape.getStride(), EltTy, Builder);
      Value *Vector = Builder.CreateAlignedLoad(
          VecTy, GEP, getAlignForIndex(I, Stride, EltTy, MAlign), IsVolatile,
          "col.load");
      Result.addVector(Vector);
    }
    return Result.addNumLoads(getNumOps(Result.getVectorTy()) *
                              Result.getNumVectors());
  }

  /// Load a tile with shape \p ResultShape from the \p MatrixShape matrix at
  /// \p MatrixPtr, offset by \p I rows and \p J columns.
  MatrixTy loadMatrix(Value *MatrixPtr, MaybeAlign Align, bool IsVolatile,
                      ShapeInfo MatrixShape, Value *I, Value *J,
                      ShapeInfo ResultShape, Type *EltTy,
                      IRBuilder<> &Builder) {
    Value *Offset = Builder.CreateAdd(
        Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride())), I);
    Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);
    auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows *
                                                   ResultShape.NumColumns);
    return loadMatrix(TileTy, TileStart, Align,
                      getIndex(MatrixPtr, MatrixShape.getStride()), IsVolatile,
                      ResultShape, Builder);
  }

  /// Lower a load instruction with shape information.
  MatrixTy LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align,
                     Value *Stride, bool IsVolatile, ShapeInfo Shape,
                     IRBuilder<> &Builder) {
    return loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile, Shape,
                      Builder);
  }

  /// Lowers llvm.matrix.column.major.load.
  MatrixTy LowerColumnMajorLoad(CallInst *Inst, IRBuilder<> &Builder) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Intrinsic only supports column-major layout!");
    Value *Ptr = Inst->getArgOperand(0);
    Value *Stride = Inst->getArgOperand(1);
    return LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride,
                     cast<ConstantInt>(Inst->getArgOperand(2))->isOne(),
                     {Inst->getArgOperand(3), Inst->getArgOperand(4)}, Builder);
  }

  /// Store the tile \p StoreVal into the \p MatrixShape matrix at
  /// \p MatrixPtr, offset by \p I rows and \p J columns.
  void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr,
                   MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,
                   Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {
    Value *Offset = Builder.CreateAdd(
        Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride())), I);
    Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);
    auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *
                                                   StoreVal.getNumColumns());
    storeMatrix(TileTy, StoreVal, TileStart, MAlign,
                getIndex(MatrixPtr, MatrixShape.getStride()), IsVolatile,
                Builder);
  }

  /// Store matrix \p StoreVal starting at \p Ptr, using \p Stride elements
  /// between the starts of consecutive vectors.
  MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr,
                       MaybeAlign MAlign, Value *Stride, bool IsVolatile,
                       IRBuilder<> &Builder) {
    auto *VType = cast<FixedVectorType>(Ty);
    Type *EltTy = VType->getElementType();
    Stride = castToIndexType(Ptr, Stride, Builder);
    for (auto Vec : enumerate(StoreVal.vectors())) {
      Value *GEP = computeVectorAddr(Ptr, getIndex(Ptr, Vec.index()), Stride,
                                     StoreVal.getStride(), EltTy, Builder);
      Builder.CreateAlignedStore(Vec.value(), GEP,
                                 getAlignForIndex(Vec.index(), Stride, EltTy,
                                                  MAlign),
                                 IsVolatile);
    }
    return MatrixTy().addNumStores(getNumOps(StoreVal.getVectorTy()) *
                                   StoreVal.getNumVectors());
  }

  /// Lower a store instruction with shape information.
  MatrixTy LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr,
                      MaybeAlign A, Value *Stride, bool IsVolatile,
                      ShapeInfo Shape, IRBuilder<> &Builder) {
    auto StoreVal = getMatrix(Matrix, Shape, Builder);
    return storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride, IsVolatile,
                       Builder);
  }

  /// Lowers llvm.matrix.column.major.store.
  MatrixTy LowerColumnMajorStore(CallInst *Inst, IRBuilder<> &Builder) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Intrinsic only supports column-major layout!");
    Value *Matrix = Inst->getArgOperand(0);
    Value *Ptr = Inst->getArgOperand(1);
    Value *Stride = Inst->getArgOperand(2);
    return LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride,
                      cast<ConstantInt>(Inst->getArgOperand(3))->isOne(),
                      {Inst->getArgOperand(4), Inst->getArgOperand(5)},
                      Builder);
  }
  /// Insert \p Block into \p Col at offset \p I using a shuffle mask: elements
  /// before \p I and after the block come from \p Col, the block itself from
  /// the second operand.
  Value *insertVector(Value *Col, unsigned I, Value *Block,
                      IRBuilder<> &Builder) {
    unsigned BlockNumElts =
        cast<FixedVectorType>(Block->getType())->getNumElements();
    unsigned NumElts = cast<FixedVectorType>(Col->getType())->getNumElements();
    assert(NumElts >= BlockNumElts && "Too few elements for current block");

    SmallVector<int, 16> Mask;
    unsigned i;
    for (i = 0; i < I; i++)
      Mask.push_back(i);

    unsigned VecNumElts =
        cast<FixedVectorType>(Col->getType())->getNumElements();
    for (; i < I + BlockNumElts; i++)
      Mask.push_back(i - I + VecNumElts);

    for (; i < VecNumElts; i++)
      Mask.push_back(i);

    return Builder.CreateShuffleVector(Col, Block, Mask);
  }

  Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
                      IRBuilder<> &Builder, bool AllowContraction,
                      unsigned &NumComputeOps) {
    NumComputeOps += getNumOps(A->getType());
    if (!Sum)
      return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);

    if (UseFPOp) {
      if (AllowContraction) {
        // Use fmuladd for floating point operations and let the backend decide
        // on lowering to fma or fmul + fadd.
        return Builder.CreateIntrinsic(Intrinsic::fmuladd, {A->getType()},
                                       {A, B, Sum});
      }
      NumComputeOps += getNumOps(A->getType());
      Value *Mul = Builder.CreateFMul(A, B);
      return Builder.CreateFAdd(Sum, Mul);
    }

    NumComputeOps += getNumOps(A->getType());
    Value *Mul = Builder.CreateMul(A, B);
    return Builder.CreateAdd(Sum, Mul);
  }

  /// Cache \p Matrix as the lowering of \p Inst and, for users without shape
  /// information, replace uses of \p Inst with the flattened vector.
  void finalizeLowering(Instruction *Inst, MatrixTy Matrix,
                        IRBuilder<> &Builder) {
    auto inserted = Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
    (void)inserted;
    assert(inserted.second && "multiple matrix lowering mapping");

    ToRemove.push_back(Inst);
    Value *Flattened = nullptr;
    for (Use &U : llvm::make_early_inc_range(Inst->uses())) {
      if (ShapeMap.contains(U.getUser()))
        continue;
      if (!Flattened) {
        Flattened = Matrix.embedInVector(Builder);
        LLVM_DEBUG(dbgs() << "flattening a " << Matrix.shape() << " matrix:\n"
                          << *Inst
                          << "\nbecause we do not have a shape-aware lowering for "
                             "its user:\n"
                          << *U.getUser() << "\n");
        FlattenedMatrices++;
      }
      U.set(Flattened);
    }
  }
  /// Lower a 1xN * Nx1 matrix multiply (a dot product) to an element-wise
  /// multiply followed by a vector reduction, when the cost model says that is
  /// cheaper than the regular lowering.
  void lowerDotProduct(CallInst *MatMul,
                       SmallPtrSet<Instruction *, 16> &FusedInsts,
                       FastMathFlags FMF) {
    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
    if (LShape.NumRows != 1 || RShape.NumColumns != 1) // not a dot product
      return;

    // Operands that are binops, loads or transposes can be used in their flat
    // vector form directly.
    auto CanBeFlattened = [](Value *Op) {
      if (match(Op, m_BinOp()))
        return true;
      return match(
          Op, m_OneUse(m_CombineOr(
                  m_Load(m_Value()),
                  m_CombineOr(m_Intrinsic<Intrinsic::matrix_transpose>(),
                              m_Intrinsic<Intrinsic::matrix_column_major_load>(
                                  m_Value(), m_SpecificInt(1))))));
    };

    // Returns the cost benefit of using \p Op with the dot product lowering.
    // If the returned cost is < 0, \p Op should be used directly.
    auto GetCostForArg = [this, &CanBeFlattened](Value *Op, unsigned N) {
      if (!ShapeMap.contains(Op))
        return InstructionCost::getInvalid();

      if (!CanBeFlattened(Op)) {
        // Embedding the operand into a flat vector requires N - 1 shuffles.
        for (unsigned I = 1; I < N; ++I)
          EmbedCost += TTI.getShuffleCost(/* ... */);
        return NewCost - OriginalCost;
      }

      // A transposed operand can be used directly; subtract the shuffles the
      // transpose lowering would have needed.
      for (unsigned I = 1; I < N; ++I)
        EmbedCost -= TTI.getShuffleCost(/* ... */);

      // A single vector load is cheaper than N scalar loads.
      return TTI.getMemoryOpCost(Instruction::Load, VecTy, Align(1), 0) -
             N * TTI.getMemoryOpCost(Instruction::Load, EltTy, Align(1), 0);
    };

    // Iterate over LHS and the operations feeding it, accumulating the cost of
    // flattening the visited ops.
    SmallPtrSet<Value *, 4> Seen;
    while (!WorkList.empty()) {
      if (OpCost + LHSCost >= LHSCost)
        continue;
      WorkList.append(I->op_begin(), I->op_end());
    }

    // Compare the reduction-based lowering against the sequential add/mul
    // chain of the default lowering.
    int AddOpCode = IsIntVec ? Instruction::Add : Instruction::FAdd;
    int MulOpCode = IsIntVec ? Instruction::Mul : Instruction::FMul;
    InstructionCost ReductionCost =
        TTI.getArithmeticReductionCost(
            AddOpCode, cast<VectorType>(LHS->getType()),
            IsIntVec ? std::nullopt : std::optional(FMF)) +
        TTI.getArithmeticInstrCost(MulOpCode, LHS->getType());
    InstructionCost SequentialAddCost =
        TTI.getArithmeticInstrCost(AddOpCode, ElementType) *
            (LShape.NumColumns - 1) +
        TTI.getArithmeticInstrCost(MulOpCode, ElementType) *
            (LShape.NumColumns);
    if ((LHSCost + ReductionCost - SequentialAddCost) > InstructionCost(0))
      return;

    FusedInsts.insert(MatMul);

    // Flatten the profitable operands: record transposed shapes as transposed
    // and turn matrix loads into flat vector loads.
    auto FlattenArg = [&Builder, &FusedInsts, &CanBeFlattened,
                       this](Value *Op) {
      if (!CanBeFlattened(Op))
        return;

      auto It = ShapeMap.find(Op);
      if (It != ShapeMap.end()) {
        It->second = It->second.t();
      }

      auto *NewLoad = Builder.CreateLoad(Op->getType(), Arg);
      Op->replaceAllUsesWith(NewLoad);

      Op->replaceAllUsesWith(Arg);
    };
    for (auto *V : ToFlatten)
      FlattenArg(V);

    // Emit the element-wise multiply and the add reduction, then insert the
    // scalar result into the 1x1 result vector expected by the users:
    //   ..., Result, uint64_t(0));

    FusedInsts.insert(MatMul);
    ToRemove.push_back(MatMul);
  }
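  // Sketch of the dot-product lowering (illustrative IR, assuming a 1x4 * 4x1
  // floating-point multiply and a profitable cost comparison):
  //   %lhs = load <4 x float>, ptr %a
  //   %rhs = load <4 x float>, ptr %b
  //   %mul = fmul <4 x float> %lhs, %rhs
  //   %dot = call float @llvm.vector.reduce.fadd.v4f32(float -0.0,
  //                                                    <4 x float> %mul)
  // The scalar %dot is then inserted into the 1x1 result vector that the
  // llvm.matrix.multiply users expect.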
  /// Compute \p Result += \p A * \p B, processing the result in blocks of at
  /// most VF elements.
  void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
                          const MatrixTy &B, IRBuilder<> &Builder, bool IsTiled,
                          bool IsScalarMatrixTransposed, FastMathFlags FMF) {
    const unsigned VF = std::max<unsigned>(
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
                .getFixedValue() /
            Result.getElementType()->getPrimitiveSizeInBits().getFixedValue(),
        1U);
    unsigned R = Result.getNumRows();
    unsigned C = Result.getNumColumns();
    unsigned M = A.getNumColumns();

    bool IsFP = Result.getElementType()->isFloatingPointTy();
    assert(A.isColumnMajor() == B.isColumnMajor() &&
           Result.isColumnMajor() == A.isColumnMajor() &&
           "operands must agree on matrix layout");
    unsigned NumComputeOps = 0;

    Builder.setFastMathFlags(FMF);

    if (A.isColumnMajor()) {
      // Multiply columns from the first operand with scalars from the second
      // operand, then move along the K axis and accumulate the columns. With
      // this the adds can be vectorized without reassociation.
      for (unsigned J = 0; J < C; ++J) {
        for (unsigned K = 0; K < M; ++K) {
          Value *Splat = Builder.CreateExtractElement(
              B.getColumn(IsScalarMatrixTransposed ? K : J),
              IsScalarMatrixTransposed ? J : K);
          Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat,
                             IsFP, Builder, FMF.allowContract(),
                             NumComputeOps);
        }
      }
    } else {
      // Multiply rows from the second operand with scalars from the first
      // operand.
      for (unsigned I = 0; I < R; ++I) {
        for (unsigned J = 0; J < C; J += BlockSize) {
          Value *Sum = nullptr;
          for (unsigned K = 0; K < M; ++K) {
            Value *Splat = Builder.CreateExtractElement(
                A.getVector(IsScalarMatrixTransposed ? K : I),
                IsScalarMatrixTransposed ? I : K);
            Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R,
                               IsFP, Builder, FMF.allowContract(),
                               NumComputeOps);
          }
        }
      }
    }
    Result.addNumComputeOps(NumComputeOps);
  }
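  // In pseudo code, the column-major kernel above is:
  //   for J in 0..C-1:        // result columns
  //     for K in 0..M-1:      // inner dimension
  //       Res[:, J] += A[:, K] * splat(B[K, J])
  // with each column processed in blocks of at most VF elements, and
  // createMulAdd() emitting llvm.fmuladd when contraction is allowed.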
  /// If the loaded memory may alias the memory written by \p Store, emit
  /// runtime alias checks and, on a possible conflict, copy the loaded matrix
  /// into a fresh alloca that is used instead of the original pointer.
  Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
                               CallInst *MatMul) {
    MemoryLocation LoadLoc = MemoryLocation::get(Load);
    MemoryLocation StoreLoc = MemoryLocation::get(Store);

    // If we can statically determine noalias we're good.
    if (AA->isNoAlias(LoadLoc, StoreLoc))
      return Load->getPointerOperand();

    // Otherwise emit runtime checks: split the block into
    //   check0 -> alias_cont -> copy -> no_alias
    // and keep the dominator tree up to date.
    DTUpdates.push_back({DT->Delete, Check0, Succ});
    auto *Check1 = SplitBlock(MatMul->getParent(), MatMul, nullptr, LI,
                              nullptr, "alias_cont");
    auto *Fusion = SplitBlock(MatMul->getParent(), MatMul, nullptr, LI,
                              nullptr, "no_alias");

    // Check whether the load location overlaps the store location.
    Value *StoreBegin = Builder.CreatePtrToInt(
        const_cast<Value *>(StoreLoc.Ptr), IntPtrTy, "store.begin");
    Value *StoreEnd = Builder.CreateAdd(
        StoreBegin, ConstantInt::get(IntPtrTy, StoreLoc.Size.getValue()),
        "store.end", true, true);
    Value *LoadBegin = Builder.CreatePtrToInt(const_cast<Value *>(LoadLoc.Ptr),
                                              IntPtrTy, "load.begin");
    Value *LoadEnd = Builder.CreateAdd(
        LoadBegin, ConstantInt::get(IntPtrTy, LoadLoc.Size.getValue()),
        "load.end", true, true);

    // On a possible overlap, copy the loaded matrix into a fresh alloca and
    // select between the two pointers with a PHI.
    auto *ArrayTy = ArrayType::get(VT->getElementType(), VT->getNumElements());
    AllocaInst *Alloca =
        Builder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());

    PHI->addIncoming(Load->getPointerOperand(), Check1);
    PHI->addIncoming(Alloca, Copy);

    DTUpdates.push_back({DT->Insert, Check0, Check1});
    DTUpdates.push_back({DT->Insert, Check0, Fusion});
    DTUpdates.push_back({DT->Insert, Check1, Fusion});
    DT->applyUpdates(DTUpdates);
    return PHI;
  }

  bool isFusionProfitable(CallInst *MatMul) {
    if (ForceFusion)
      return true;

    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
    const unsigned R = LShape.NumRows;
    const unsigned C = RShape.NumColumns;
    const unsigned M = LShape.NumColumns;

    const unsigned VF = std::max<unsigned>(
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
                .getFixedValue() /
            EltType->getPrimitiveSizeInBits().getFixedValue(),
        1U);

    // A matrix-vector product whose rows fit into one vector register is not
    // worth fusing.
    if (R <= VF && C == 1)
      return false;

    // Estimate how many vector registers both operands occupy; fuse only if
    // they do not fit into the register file together.
    unsigned Op0Regs = (R + VF - 1) / VF * M;
    unsigned Op1Regs = (M + VF - 1) / VF * C;
    return Op0Regs + Op1Regs >
           TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true));
  }

  MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
    MatrixTy Res;
    auto *ColumnTy = FixedVectorType::get(EltType, R);
    for (unsigned I = 0; I < C; ++I)
      Res.addVector(ConstantAggregateZero::get(ColumnTy));
    return Res;
  }
  /// Lower \p MatMul as a loop nest over square TileSize x TileSize tiles,
  /// accumulating each result tile in vector PHIs carried by the inner loop.
  void createTiledLoops(CallInst *MatMul, Value *LPtr, ShapeInfo LShape,
                        Value *RPtr, ShapeInfo RShape, StoreInst *Store) {
    // Create the loop nest (rows, columns, inner K dimension).
    TileInfo TI(LShape.NumRows, RShape.NumColumns, LShape.NumColumns,
                TileSize);
    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
    BasicBlock *InnerBody = TI.CreateTiledLoops(Start, End, Builder, DTU, *LI);

    // The result tile is accumulated in PHIs carried by the K loop.
    MatrixTy TileResult;
    for (unsigned I = 0; I < TileSize; I++) {
      auto *Phi = Builder.CreatePHI(TileVecTy, 2, "result.vec." + Twine(I));
      Phi->addIncoming(ConstantAggregateZero::get(TileVecTy),
                       TI.RowLoop.Header->getSingleSuccessor());
      TileResult.addVector(Phi);
      ColumnPhis.push_back(Phi);
    }

    // Load the current tiles of both operands, multiply and accumulate.
    MatrixTy A =
        loadMatrix(LPtr, {}, false, LShape, TI.RowLoop.Index, TI.KLoop.Index,
                   {TileSize, TileSize}, EltType, Builder);
    MatrixTy B =
        loadMatrix(RPtr, {}, false, RShape, TI.KLoop.Index,
                   TI.ColumnLoop.Index, {TileSize, TileSize}, EltType,
                   Builder);
    emitMatrixMultiply(TileResult, A, B, Builder, true, false,
                       getFastMathFlags(MatMul));

    // Store the result tile once the K loop is done.
    storeMatrix(TileResult, Store->getPointerOperand(), Store->getAlign(),
                Store->isVolatile(), {LShape.NumRows, RShape.NumColumns},
                TI.RowLoop.Index, TI.ColumnLoop.Index, EltType, Builder);

    for (unsigned I = 0; I < TileResult.getNumVectors(); I++)
      ColumnPhis[I]->addIncoming(TileResult.getVector(I), TI.KLoop.Latch);

    // Force unrolling of a few iterations of the inner loop, to make sure
    // there is enough work per iteration.
    unsigned InnerLoopUnrollCount =
        std::min(10u, LShape.NumColumns / TileSize);
    addStringMetadataToLoop(LI->getLoopFor(TI.KLoop.Header),
                            "llvm.loop.unroll.count", InnerLoopUnrollCount);
  }
  void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
                      StoreInst *Store,
                      SmallPtrSetImpl<Instruction *> &FusedInsts) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Tiling only supported for column-major matrixes at the moment!");
    if (!isFusionProfitable(MatMul))
      return;

    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
    const unsigned R = LShape.NumRows;
    const unsigned C = RShape.NumColumns;
    const unsigned M = LShape.NumColumns;

    Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
    Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
    Value *CPtr = Store->getPointerOperand();

    if (TileUseLoops)
      createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store);
    else {
      IRBuilder<> Builder(Store);
      for (unsigned J = 0; J < C; J += TileSize)
        for (unsigned I = 0; I < R; I += TileSize) {
          const unsigned TileR = std::min(R - I, unsigned(TileSize));
          const unsigned TileC = std::min(C - J, unsigned(TileSize));
          MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);

          for (unsigned K = 0; K < M; K += TileSize) {
            const unsigned TileM = std::min(M - K, unsigned(TileSize));
            MatrixTy A =
                loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(),
                           LShape, getIndex(APtr, I), getIndex(APtr, K),
                           {TileR, TileM}, EltType, Builder);
            MatrixTy B =
                loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
                           RShape, getIndex(BPtr, K), getIndex(BPtr, J),
                           {TileM, TileC}, EltType, Builder);
            emitMatrixMultiply(Res, A, B, Builder, true, false,
                               getFastMathFlags(MatMul));
          }
          storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(),
                      {R, M}, getIndex(CPtr, I), getIndex(CPtr, J), EltType,
                      Builder);
        }
    }

    // After the fused lowering, the original instructions are dead.
    FusedInsts.insert(Store);
    FusedInsts.insert(MatMul);
    eraseFromParentAndRemoveFromShapeMap(Store);
    eraseFromParentAndRemoveFromShapeMap(MatMul);
    if (LoadOp0->use_empty()) {
      FusedInsts.insert(LoadOp0);
      eraseFromParentAndRemoveFromShapeMap(LoadOp0);
    }
    if (LoadOp1 != LoadOp0 && LoadOp1->use_empty()) {
      FusedInsts.insert(LoadOp1);
      eraseFromParentAndRemoveFromShapeMap(LoadOp1);
    }
  }
  /// Try a fused lowering for \p MatMul: either multiply directly with a
  /// transposed operand, or tile the computation when both operands are loads
  /// feeding a store.
  void LowerMatrixMultiplyFused(CallInst *MatMul,
                                SmallPtrSetImpl<Instruction *> &FusedInsts,
                                SmallVectorImpl<IntrinsicInst *> &LifetimeEnds) {
    assert(AA && LI && "Analyses should be available");

    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
    const unsigned R = LShape.NumRows;
    const unsigned M = LShape.NumColumns;
    const unsigned C = RShape.NumColumns;

    // When one operand is a not-yet-lowered transpose, feed the transposed
    // operand into emitMatrixMultiply directly instead of materializing the
    // transpose:
    //   RHS transposed:
    MA = getMatrix(A, ShapeInfo(R, M), Builder);
    MB = getMatrix(T, ShapeInfo(C, M), Builder);
    //   LHS transposed:
    MA = getMatrix(T, ShapeInfo(R, M), Builder);
    MB = getMatrix(B, ShapeInfo(C, M), Builder);

    // Initialize the output.
    MatrixTy Result(R, C, EltType);

    emitMatrixMultiply(Result, MA, MB, Builder, false, true,
                       getFastMathFlags(MatMul));

    FusedInsts.insert(MatMul);
    // Add a fake entry for the folded transpose so it is included in the
    // expression in the remark.
    Inst2ColumnMatrix[Transpose] = MatrixTy(M, C, EltType);
    finalizeLowering(MatMul, Result, Builder);

    // Lower {ld, ld} -> matmul -> st chains using tiling. Hoist the
    // instructions the fused multiply depends on above the store, so the
    // whole chain can be replaced.
    if (LoadOp0 && LoadOp1 && Store) {
      SetVector<Value *> WorkList;
      for (unsigned I = 0; I != WorkList.size(); ++I) {
        Value *Current = WorkList[I];
        auto *CurrI = dyn_cast<Instruction>(Current);
        if (!CurrI)
          continue;
        if (DT->dominates(CurrI, MatMul))
          continue;
        if (CurrI->mayHaveSideEffects() || CurrI->mayReadFromMemory())
          return;
      }

      sort(ToHoist, [this](Instruction *A, Instruction *B) {
        return DT->dominates(A, B);
      });
      for (Instruction *I : ToHoist)
        I->moveBefore(MatMul);

      // Deal with lifetime.end calls that might be between the loads and the
      // store: drop the ones that may refer to the loaded memory, so the
      // tiled lowering does not read from dead objects.
      bool FusableOpsInSameBlock = LoadOp0->getParent() == StoreParent &&
                                   LoadOp1->getParent() == StoreParent;
      for (unsigned Idx = 0; Idx != LifetimeEnds.size();) {
        IntrinsicInst *End = LifetimeEnds[Idx];
        // lifetime.ends before the loads or after the store can be ignored.
        if (DT->dominates(End, LoadOp0) && DT->dominates(End, LoadOp1))
          continue;
        if (DT->dominates(Store, End))
          continue;
        if (FusableOpsInSameBlock && End->getParent() != StoreParent)
          continue;
        if (AA->isNoAlias(Load0Loc, EndLoc) && AA->isNoAlias(Load1Loc, EndLoc))
          continue;
        // Otherwise remove the lifetime.end.
        ToRemove.push_back(End);
      }

      emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
      return;
    }
  }
  /// Lowers llvm.matrix.multiply.
  MatrixTy LowerMultiply(CallInst *MatMul, IRBuilder<> &Builder) {
    auto *EltType = cast<FixedVectorType>(MatMul->getType())->getElementType();
    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));

    const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder);
    const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder);
    assert(Lhs.getElementType() == Rhs.getElementType() &&
           "Matrix multiply argument element types do not match.");

    const unsigned R = LShape.NumRows;
    const unsigned C = RShape.NumColumns;
    assert(LShape.NumColumns == RShape.NumRows);

    // Initialize the output.
    MatrixTy Result(R, C, EltType);
    assert(Lhs.getElementType() == Result.getElementType() &&
           "Matrix multiply result element type does not match arguments.");

    emitMatrixMultiply(Result, Lhs, Rhs, Builder, false, false,
                       getFastMathFlags(MatMul));
    return Result;
  }

  /// Lowers llvm.matrix.transpose.
  MatrixTy LowerTranspose(CallInst *Inst, IRBuilder<> &Builder) {
    MatrixTy Result;
    Value *InputVal = Inst->getArgOperand(0);
    ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
    MatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder);

    const unsigned NewNumVecs =
        InputMatrix.isColumnMajor() ? ArgShape.NumRows : ArgShape.NumColumns;
    const unsigned NewNumElts =
        InputMatrix.isColumnMajor() ? ArgShape.NumColumns : ArgShape.NumRows;

    for (unsigned I = 0; I < NewNumVecs; ++I) {
      // Build a single result vector: take element I from every input vector
      // and insert it at the position of that input vector.
      Value *ResultVector = PoisonValue::get(
          FixedVectorType::get(InputMatrix.getElementType(), NewNumElts));
      for (auto J : enumerate(InputMatrix.vectors())) {
        Value *Elt = Builder.CreateExtractElement(J.value(), I);
        ResultVector =
            Builder.CreateInsertElement(ResultVector, Elt, J.index());
      }
      Result.addVector(ResultVector);
    }

    return Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns)
        .addNumExposedTransposes(1);
  }
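  // Example (illustrative): transposing a column-major 2x3 matrix held as
  // three <2 x double> columns produces two <3 x double> vectors; result
  // vector I is assembled by extracting element I from each of the three
  // input columns and inserting it at the position of that column, so element
  // (r, c) of the input ends up at (c, r) of the result.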
  /// Lower load instructions with shape information.
  MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr,
                     IRBuilder<> &Builder) {
    return LowerLoad(Inst, Ptr, Inst->getAlign(),
                     getIndex(Ptr, SI.getStride()), Inst->isVolatile(), SI,
                     Builder);
  }

  MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal,
                      Value *Ptr, IRBuilder<> &Builder) {
    return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
                      getIndex(Ptr, SI.getStride()), Inst->isVolatile(), SI,
                      Builder);
  }

  /// Lower a PHI by filling in the pre-created placeholder PHIs (see Visit())
  /// with the lowered vectors of each incoming value.
  MatrixTy VisitPHI(PHINode *Inst, const ShapeInfo &SI, IRBuilder<> &Builder) {
    auto BlockIP = Inst->getParent()->getFirstInsertionPt();
    Builder.SetInsertPoint(BlockIP);
    MatrixTy PhiM = getMatrix(Inst, SI, Builder);

    for (auto [IncomingV, IncomingB] :
         llvm::zip_equal(Inst->incoming_values(), Inst->blocks())) {
      // Lower the incoming value right after its definition, so its vectors
      // are available when filling in the PHI.
      if (auto *IncomingInst = dyn_cast<Instruction>(IncomingV))
        if (auto MaybeIP = IncomingInst->getInsertionPointAfterDef())
          Builder.SetInsertPoint(*MaybeIP);

      MatrixTy OpM = getMatrix(IncomingV, SI, Builder);

      for (unsigned VI = 0, VE = PhiM.getNumVectors(); VI != VE; ++VI) {
        PHINode *NewPHI = cast<PHINode>(PhiM.getVector(VI));
        NewPHI->addIncoming(OpM.getVector(VI), IncomingB);
      }
    }

    // finalizeLowering() may also insert instructions; the safe place for
    // those is at the end of the initial block of PHIs.
    Builder.SetInsertPoint(BlockIP);
    return PhiM;
  }
  /// Lower binary operators, if shape information is available.
  MatrixTy VisitBinaryOperator(BinaryOperator *Inst, const ShapeInfo &SI,
                               IRBuilder<> &Builder) {
    Value *Lhs = Inst->getOperand(0);
    Value *Rhs = Inst->getOperand(1);

    MatrixTy Result;
    MatrixTy A = getMatrix(Lhs, SI, Builder);
    MatrixTy B = getMatrix(Rhs, SI, Builder);
    assert(A.isColumnMajor() == B.isColumnMajor() &&
           Result.isColumnMajor() == A.isColumnMajor() &&
           "operands must agree on matrix layout");

    Builder.setFastMathFlags(getFastMathFlags(Inst));

    for (unsigned I = 0; I < SI.getNumVectors(); ++I)
      Result.addVector(Builder.CreateBinOp(Inst->getOpcode(), A.getVector(I),
                                           B.getVector(I)));

    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                   Result.getNumVectors());
  }

  /// Lower unary operators, if shape information is available.
  MatrixTy VisitUnaryOperator(UnaryOperator *Inst, const ShapeInfo &SI,
                              IRBuilder<> &Builder) {
    Value *Op = Inst->getOperand(0);

    MatrixTy Result;
    MatrixTy M = getMatrix(Op, SI, Builder);

    Builder.setFastMathFlags(getFastMathFlags(Inst));

    // Helper to perform the unary operation on a single vector.
    auto BuildVectorOp = [&Builder, Inst](Value *Op) {
      switch (Inst->getOpcode()) {
      case Instruction::FNeg:
        return Builder.CreateFNeg(Op);
      default:
        llvm_unreachable("Unsupported unary operator for matrix");
      }
    };

    for (auto *Vector : M.vectors())
      Result.addVector(BuildVectorOp(Vector));

    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                   Result.getNumVectors());
  }

  /// Lower cast instructions, if shape information is available.
  MatrixTy VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape,
                                IRBuilder<> &Builder) {
    Value *Op = Inst->getOperand(0);

    MatrixTy Result;
    MatrixTy M = getMatrix(Op, Shape, Builder);

    Builder.setFastMathFlags(getFastMathFlags(Inst));

    auto *OrigVTy = cast<VectorType>(Inst->getType());
    auto *NewVTy = VectorType::get(OrigVTy->getElementType(),
                                   ElementCount::getFixed(M.getStride()));

    for (auto *Vector : M.vectors())
      Result.addVector(Builder.CreateCast(Inst->getOpcode(), Vector, NewVTy));

    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                   Result.getNumVectors());
  }

  /// Lower selects, if shape information is available.
  MatrixTy VisitSelectInst(SelectInst *Inst, const ShapeInfo &Shape,
                           IRBuilder<> &Builder) {
    Value *Cond = Inst->getOperand(0);
    Value *OpA = Inst->getOperand(1);
    Value *OpB = Inst->getOperand(2);

    MatrixTy Result;
    MatrixTy A = getMatrix(OpA, Shape, Builder);
    MatrixTy B = getMatrix(OpB, Shape, Builder);

    // The condition may itself be a matrix of i1, or a single scalar that
    // applies to every vector.
    SmallVector<Value *> CondV;
    if (isa<FixedVectorType>(Cond->getType())) {
      MatrixTy C = getMatrix(Cond, Shape, Builder);
      llvm::copy(C.vectors(), std::back_inserter(CondV));
    } else {
      CondV.resize(A.getNumVectors());
      std::fill(CondV.begin(), CondV.end(), Cond);
    }

    for (unsigned I = 0; I < A.getNumVectors(); ++I)
      Result.addVector(
          Builder.CreateSelect(CondV[I], A.getVector(I), B.getVector(I)));

    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                   Result.getNumVectors());
  }
  /// Helper to linearize a matrix expression tree into a string. Matrix
  /// expressions are linearized starting at an expression leaf and working
  /// bottom up.
  struct ExprLinearizer {
    unsigned LengthToBreak = 100;
    std::string Str;
    raw_string_ostream Stream;
    unsigned LineLength = 0;
    const DataLayout &DL;

    /// Mapping from instructions to matrixes; used to identify matrix
    /// instructions.
    const MapVector<Value *, MatrixTy> &Inst2Matrix;

    /// Mapping from values to the leaves of all expressions the value is part
    /// of.
    const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;

    /// Set of matrix expressions in the scope of a given DISubprogram.
    const SmallSetVector<Value *, 32> &ExprsInSubprogram;

    /// Leaf node of the expression to linearize.
    Value *Leaf;

    /// Tracks sub-expressions that get reused while linearizing the
    /// expression; re-used sub-expressions are marked as (reused).
    SmallPtrSet<Value *, 8> ReusedExprs;

    ExprLinearizer(const DataLayout &DL,
                   const MapVector<Value *, MatrixTy> &Inst2Matrix,
                   const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
                   const SmallSetVector<Value *, 32> &ExprsInSubprogram,
                   Value *Leaf)
        : Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),
          ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}

    void indent(unsigned N) {
      LineLength += N;
      for (unsigned i = 0; i < N; i++)
        Stream << " ";
    }

    void lineBreak() {
      Stream << "\n";
      LineLength = 0;
    }

    void maybeIndent(unsigned Indent) {
      if (LineLength >= LengthToBreak)
        lineBreak();
      if (LineLength == 0)
        indent(Indent);
    }

    void write(StringRef S) {
      LineLength += S.size();
      Stream << S;
    }

    Value *getUnderlyingObjectThroughLoads(Value *V) {
      if (Value *Ptr = getPointerOperand(V))
        return getUnderlyingObjectThroughLoads(Ptr);
      else if (V->getType()->isPointerTy())
        return getUnderlyingObject(V);
      return V;
    }

    /// Returns true if \p V is a matrix value in the given subprogram.
    bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); }

    /// If \p V is a matrix value, print its shape as NumRows x NumColumns to
    /// \p SS.
    void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {
      auto M = Inst2Matrix.find(V);
      if (M == Inst2Matrix.end())
        SS << "unknown";
      else {
        SS << M->second.getNumRows();
        SS << "x";
        SS << M->second.getNumColumns();
      }
    }

    /// Write the called function name. Handles calls to llvm.matrix.*
    /// specially: the name is followed by the dimensions of the input
    /// matrixes and the scalar type name.
    void writeFnName(CallInst *CI) {
      if (!CI->getCalledFunction())
        write("<no called fn>");
      else {
        StringRef Name = CI->getCalledFunction()->getName();
        if (!Name.starts_with("llvm.matrix")) {
          write(Name);
          return;
        }
        auto *II = cast<IntrinsicInst>(CI);
        std::string Tmp;
        raw_string_ostream SS(Tmp);

        switch (II->getIntrinsicID()) {
        case Intrinsic::matrix_multiply:
          prettyPrintMatrixType(II->getOperand(0), SS);
          SS << ".";
          prettyPrintMatrixType(II->getOperand(1), SS);
          SS << "." << *II->getType()->getScalarType();
          break;
        case Intrinsic::matrix_transpose:
          prettyPrintMatrixType(II->getOperand(0), SS);
          SS << "." << *II->getType()->getScalarType();
          break;
        case Intrinsic::matrix_column_major_load:
          prettyPrintMatrixType(II, SS);
          SS << "." << *II->getType()->getScalarType();
          break;
        case Intrinsic::matrix_column_major_store:
          prettyPrintMatrixType(II->getOperand(0), SS);
          SS << "." << *II->getOperand(0)->getType()->getScalarType();
          break;
        default:
          llvm_unreachable("Unhandled case");
        }
        write(Tmp);
      }
    }

    unsigned getNumShapeArgs(CallInst *CI) const {
      if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::matrix_multiply:
          return 3;
        case Intrinsic::matrix_transpose:
          return 2;
        case Intrinsic::matrix_column_major_load:
        case Intrinsic::matrix_column_major_store:
          return 3;
        default:
          return 0;
        }
      }
      return 0;
    }

    /// Special printing for values: for pointers, print whether they refer to
    /// a stack or other address; for other values either print the constant or
    /// "matrix"/"scalar".
    void write(Value *V) {
      V = getUnderlyingObjectThroughLoads(V);
      if (V->getType()->isPointerTy()) {
        if (isa<AllocaInst>(V)) {
          Stream << "stack addr";
          LineLength += StringRef("stack addr").size();
        } else {
          Stream << "addr";
          LineLength += StringRef("addr").size();
        }
        if (!V->getName().empty()) {
          Stream << " %" << V->getName() << "";
          LineLength += V->getName().size() + 2;
        }
        return;
      }

      std::string Tmp;
      raw_string_ostream TmpStream(Tmp);

      if (auto *CI = dyn_cast<ConstantInt>(V))
        TmpStream << CI->getValue();
      else if (isa<Constant>(V))
        TmpStream << "constant";
      else if (isMatrix(V))
        TmpStream << "matrix";
      else
        TmpStream << "scalar";
      Tmp = std::string(StringRef(Tmp).trim());
      LineLength += Tmp.size();
      Stream << Tmp;
    }

    /// Linearize expression \p Expr starting at an indentation of \p Indent.
    /// Expressions that are re-used multiple times are prefixed with (reused)
    /// at the re-used root instruction.
    void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused,
                       bool ParentShared) {
      auto *I = cast<Instruction>(Expr);
      maybeIndent(Indent);
      SmallVector<Value *, 8> Ops;

      // Is Expr shared with other expression leaves?
      bool ExprShared = false;
      if (!ParentShared) {
        auto SI = Shared.find(Expr);
        assert(SI != Shared.end() && SI->second.count(Leaf));

        write("shared with remark at line " + std::to_string(DL.getLine()) +
              " column " + std::to_string(DL.getCol()) + " (");
        ExprShared = SI->second.size() > 1;
      }

      bool Reused = !ReusedExprs.insert(Expr).second;
      if (Reused && !ParentReused)
        write("(reused) ");

      if (auto *CI = dyn_cast<CallInst>(I)) {
        writeFnName(CI);
        Ops.append(CI->arg_begin(), CI->arg_end() - getNumShapeArgs(CI));
      } else {
        Ops.append(I->value_op_begin(), I->value_op_end());
        write(I->getOpcodeName());
      }

      write("(");

      unsigned NumOpsToBreak = 1;
      if (match(Expr, m_Intrinsic<Intrinsic::matrix_column_major_load>()))
        NumOpsToBreak = 2;

      for (Value *Op : Ops) {
        if (Ops.size() > NumOpsToBreak)
          lineBreak();

        maybeIndent(Indent + 1);
        if (isMatrix(Op))
          linearizeExpr(Op, Indent + 1, Reused, ExprShared);
        else
          write(Op);
        if (Op != Ops.back())
          write(", ");
      }

      write(")");
    }

    const std::string &getResult() { return Str; }
  };
  /// Generate remarks for matrix operations in a function:
  /// 1. Group matrix operations by the DISubprogram they are contained in,
  ///    using inlined-at debug information.
  /// 2. Collect the leaves of the matrix expressions (getExpressionLeaves).
  /// 3. For each leaf, emit a remark with the op counts and a linearized
  ///    version of the matrix expression.
  struct RemarkGenerator {
    const MapVector<Value *, MatrixTy> &Inst2Matrix;
    OptimizationRemarkEmitter &ORE;
    Function &Func;
    const DataLayout &DL;

    RemarkGenerator(const MapVector<Value *, MatrixTy> &Inst2Matrix,
                    OptimizationRemarkEmitter &ORE, Function &Func)
        : Inst2Matrix(Inst2Matrix), ORE(ORE), Func(Func),
          DL(Func.getDataLayout()) {}

    /// Return all leaves of the expressions in \p ExprsInSubprogram: values
    /// without any users inside the set (currently stores).
    SmallVector<Value *, 4>
    getExpressionLeaves(const SmallSetVector<Value *, 32> &ExprsInSubprogram) {
      SmallVector<Value *, 4> Leaves;
      for (auto *Expr : ExprsInSubprogram)
        if (Expr->getType()->isVoidTy() ||
            !any_of(Expr->users(), [&ExprsInSubprogram](User *U) {
              return ExprsInSubprogram.count(U);
            }))
          Leaves.push_back(Expr);
      return Leaves;
    }

    /// Recursively traverse expression \p V starting at leaf \p Leaf and add
    /// \p Leaf to the set of expressions \p V is part of.
    void collectSharedInfo(Value *Leaf, Value *V,
                           const SmallSetVector<Value *, 32> &ExprsInSubprogram,
                           DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
      if (!ExprsInSubprogram.count(V))
        return;

      Shared[V].insert(Leaf);

      for (Value *Op : cast<Instruction>(V)->operand_values())
        collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);
    }

    /// Calculate the exclusive and shared op counts for the expression rooted
    /// at \p Root. Expressions used multiple times are counted once.
    std::pair<OpInfoTy, OpInfoTy>
    sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
               const SmallSetVector<Value *, 32> &ExprsInSubprogram,
               DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
      if (!ExprsInSubprogram.count(Root))
        return {};

      // Already counted this expression. Stop.
      if (!ReusedExprs.insert(Root).second)
        return {};

      OpInfoTy Count;
      OpInfoTy SharedCount;

      auto I = Shared.find(Root);
      auto CM = Inst2Matrix.find(Root);
      if (I->second.size() == 1)
        Count = CM->second.getOpInfo();
      else
        SharedCount = CM->second.getOpInfo();

      for (Value *Op : cast<Instruction>(Root)->operand_values()) {
        auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared);
        Count += C.first;
        SharedCount += C.second;
      }
      return {Count, SharedCount};
    }

    void emitRemarks() {
      // Map matrix values to the subprogram they are contained in, walking the
      // inlined-at chain of their debug locations.
      MapVector<DISubprogram *, SmallVector<Value *, 8>> Subprog2Exprs;
      for (const auto &KV : Inst2Matrix) {
        if (Func.getSubprogram()) {
          auto *I = cast<Instruction>(KV.first);
          DILocation *Context = I->getDebugLoc();
          while (Context) {
            Subprog2Exprs[getSubprogram(Context->getScope())].push_back(
                KV.first);
            Context = DebugLoc(Context).getInlinedAt();
          }
        } else {
          Subprog2Exprs[nullptr].push_back(KV.first);
        }
      }

      for (auto &KV : Subprog2Exprs) {
        SmallSetVector<Value *, 32> ExprsInSubprogram(KV.second.begin(),
                                                      KV.second.end());
        auto Leaves = getExpressionLeaves(ExprsInSubprogram);

        DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
        for (Value *Leaf : Leaves)
          collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared);

        // Generate remarks for each leaf.
        for (auto *L : Leaves) {
          SmallPtrSet<Value *, 8> ReusedExprs;
          OpInfoTy Counts, SharedCounts;
          std::tie(Counts, SharedCounts) =
              sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared);

          OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,
                                 cast<Instruction>(L)->getParent());

          Rem << "Lowered with ";
          Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
              << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
              << ore::NV("NumComputeOps", Counts.NumComputeOps)
              << " compute ops, "
              << ore::NV("NumExposedTransposes", Counts.NumExposedTransposes)
              << " exposed transposes";

          if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
              SharedCounts.NumComputeOps > 0) {
            Rem << ",\nadditionally "
                << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
                << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
                << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
                << " compute ops"
                << " are shared with other expressions";
          }

          Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL));
          ORE.emit(Rem);
        }
      }
    }

    std::string
    linearize(Value *L,
              const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
              const SmallSetVector<Value *, 32> &ExprsInSubprogram,
              const DataLayout &DL) {
      ExprLinearizer Lin(DL, Inst2Matrix, Shared, ExprsInSubprogram, L);
      Lin.linearizeExpr(L, 0, false, false);
      return Lin.getResult();
    }
  };
PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  LowerMatrixIntrinsics LMT(F, TTI, Minimal ? nullptr : &AM);
  if (LMT.Visit()) {
    PreservedAnalyses PA;
    if (!Minimal) {
      PA.preserve<LoopAnalysis>();
      PA.preserve<DominatorTreeAnalysis>();
    }
    return PA;
  }
  return PreservedAnalyses::all();
}

void LowerMatrixIntrinsicsPass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LowerMatrixIntrinsicsPass> *>(this)->printPipeline(
      OS, MapClassName2PassName);
  OS << '<';
  if (Minimal)
    OS << "minimal";
  OS << '>';
}