// The pass transforms load/store <256 x i32> to AMX load/store intrinsics
// or split the data to two <128 x i32>.

#include "llvm/IR/IntrinsicsX86.h"

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "lower-amx-type"
static bool isAMXCast(Instruction *II) {
  return match(II,
               m_Intrinsic<Intrinsic::x86_cast_vector_to_tile>(m_Value())) ||
         match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value()));
}
static bool isAMXIntrinsic(Value *I) {
  auto *II = dyn_cast<IntrinsicInst>(I);
  if (!II || isAMXCast(II))
    return false;
  // An intrinsic is an AMX intrinsic if its result or any of its operands is
  // of type x86_amx.
  if (II->getType()->isX86_AMXTy())
    return true;
  for (Value *V : II->args())
    if (V->getType()->isX86_AMXTy())
      return true;
  return false;
}
static bool containsAMXCode(Function &F) {
  for (BasicBlock &BB : F)
    for (Instruction &I : BB)
      if (I.getType()->isX86_AMXTy())
        return true;
  return false;
}
static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder,
                                           BasicBlock *BB, Type *Ty) {
  Function &F = *BB->getParent();
  const DataLayout &DL = BB->getModule()->getDataLayout();

  LLVMContext &Ctx = Builder.getContext();
  auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx));
  unsigned AllocaAS = DL.getAllocaAddrSpace();
  AllocaInst *AllocaRes =
      new AllocaInst(Ty, AllocaAS, "", F.getEntryBlock().begin());
  AllocaRes->setAlignment(AllocaAlignment);
  return AllocaRes;
}
static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) {
  for (Instruction &I : F.getEntryBlock())
    if (!isa<AllocaInst>(&I))
      return &I;
  llvm_unreachable("No terminator in the entry block!");
}
static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
  IRBuilder<> Builder(II);
  Value *Row = nullptr, *Col = nullptr;
  switch (II->getIntrinsicID()) {
  default:
    llvm_unreachable("Expect amx intrinsics");
  case Intrinsic::x86_tileloadd64_internal:
  case Intrinsic::x86_tileloaddt164_internal:
  case Intrinsic::x86_tilestored64_internal: {
    Row = II->getArgOperand(0);
    Col = II->getArgOperand(1);
    break;
  }
  // a * b + c
  // The shape depends on which operand.
  case Intrinsic::x86_tcmmimfp16ps_internal:
  case Intrinsic::x86_tcmmrlfp16ps_internal:
  case Intrinsic::x86_tdpbssd_internal:
  case Intrinsic::x86_tdpbsud_internal:
  case Intrinsic::x86_tdpbusd_internal:
  case Intrinsic::x86_tdpbuud_internal:
  case Intrinsic::x86_tdpbf16ps_internal:
  case Intrinsic::x86_tdpfp16ps_internal: {
    switch (OpNo) {
    case 3:
      Row = II->getArgOperand(0);
      Col = II->getArgOperand(1);
      break;
    case 4:
      Row = II->getArgOperand(0);
      Col = II->getArgOperand(2);
      break;
    case 5:
      if (isa<ConstantInt>(II->getArgOperand(2)))
        Row = Builder.getInt16(
            (cast<ConstantInt>(II->getOperand(2))->getSExtValue()) / 4);
      else if (isa<Instruction>(II->getArgOperand(2))) {
        // The row of the B operand is K/4; emit the udiv right after the
        // definition of K so the computed shape dominates its uses.
        Builder.SetInsertPoint(cast<Instruction>(II->getOperand(2)));
        Row = Builder.CreateUDiv(II->getOperand(2), Builder.getInt16(4));
        cast<Instruction>(Row)->moveAfter(cast<Instruction>(II->getOperand(2)));
      }
      Col = II->getArgOperand(1);
      break;
    }
    break;
  }
  }

  return std::make_pair(Row, Col);
}
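// Worked example (hedged, names illustrative): for
//   %d = call x86_amx @llvm.x86.tdpbssd.internal(i16 %m, i16 %n, i16 %k,
//                                                x86_amx %c, x86_amx %a,
//                                                x86_amx %b)
// getShape(II, 3) yields {%m, %n} for the accumulator, getShape(II, 4)
// yields {%m, %k} for %a, and getShape(II, 5) yields {%k/4, %n} for %b,
// since the B operand packs four bytes per element row.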
static std::pair<Value *, Value *> getShape(PHINode *Phi) {
  Use &U = *(Phi->use_begin());
  unsigned OpNo = U.getOperandNo();
  User *V = U.getUser();
  // TODO: We don't traverse all users. To keep the algorithm simple, we just
  // traverse the first user. If we can find the shape, return it; otherwise
  // return {nullptr, nullptr} and the optimization for undef/zero incoming
  // values is skipped.
  while (V) {
    if (isAMXCast(dyn_cast<Instruction>(V))) {
      if (V->use_empty())
        break;
      Use &U = *(V->use_begin());
      OpNo = U.getOperandNo();
      V = U.getUser();
    } else if (isAMXIntrinsic(V)) {
      return getShape(cast<IntrinsicInst>(V), OpNo);
    } else if (isa<PHINode>(V)) {
      if (V->use_empty())
        break;
      Use &U = *(V->use_begin());
      V = U.getUser();
    } else {
      break;
    }
  }
  return std::make_pair(nullptr, nullptr);
}
class X86LowerAMXType {
  Function &Func;

  // In AMX intrinsics we let Shape = {Row, Col}, but the real column is
  // Col / ElementSize; the map caches the computed column values.
  std::map<Value *, Value *> Col2Row;

public:
  X86LowerAMXType(Function &F) : Func(F) {}
  bool visit();
  void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
  void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
  bool transformBitcast(BitCastInst *Bitcast);
};
void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
  Value *Row = nullptr, *Col = nullptr;
  Use &U = *(Bitcast->use_begin());
  unsigned OpNo = U.getOperandNo();
  auto *II = cast<IntrinsicInst>(U.getUser());
  std::tie(Row, Col) = getShape(II, OpNo);
  IRBuilder<> Builder(Bitcast);
  // Use the maximum column as stride.
  Value *Stride = Builder.getInt64(64);
  Value *I8Ptr = LD->getOperand(0);
  std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};

  Value *NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal,
                                           std::nullopt, Args);
  Bitcast->replaceAllUsesWith(NewInst);
}
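// The rewrite above, sketched in IR (names illustrative):
//   %src = load <256 x i32>, ptr %addr, align 64
//   %2 = bitcast <256 x i32> %src to x86_amx
// -->
//   %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
//                                                    ptr %addr, i64 64)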
void X86LowerAMXType::combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
  Value *Tile = Bitcast->getOperand(0);
  auto *II = cast<IntrinsicInst>(Tile);
  // The tile is the output of an AMX intrinsic: the first operand of the
  // intrinsic is the row, the second operand is the column.
  Value *Row = II->getOperand(0);
  Value *Col = II->getOperand(1);
  IRBuilder<> Builder(ST);
  // Use the maximum column as stride.
  Value *Stride = Builder.getInt64(64);
  Value *I8Ptr = ST->getOperand(1);
  std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile};
  Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, std::nullopt,
                          Args);
  if (Bitcast->hasOneUse())
    return;
  // The bitcast has other users; let them load the stored vector back.
  Value *Vec = Builder.CreateLoad(Bitcast->getType(), ST->getOperand(1));
  Bitcast->replaceAllUsesWith(Vec);
}
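// Sketched in IR (names illustrative):
//   %vec = bitcast x86_amx %tile to <256 x i32>
//   store <256 x i32> %vec, ptr %addr, align 64
// -->
//   call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %addr,
//                                             i64 64, x86_amx %tile)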
// Transform a bitcast that cannot be folded into a neighboring load/store
// into a <store, load> pair through a stack slot.
bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
  IRBuilder<> Builder(Bitcast);
  AllocaInst *AllocaAddr;
  Value *I8Ptr, *Stride;
  auto *Src = Bitcast->getOperand(0);

  auto Prepare = [&](Type *MemTy) {
    AllocaAddr = createAllocaInstAtEntry(Builder, Bitcast->getParent(), MemTy);
    I8Ptr = AllocaAddr;
    Stride = Builder.getInt64(64);
  };

  if (Bitcast->getType()->isX86_AMXTy()) {
    // Vector to tile: spill the vector and tile-load it back.
    Use &U = *(Bitcast->use_begin());
    unsigned OpNo = U.getOperandNo();
    auto *II = dyn_cast<IntrinsicInst>(U.getUser());
    if (!II)
      return false; // May be bitcast from x86_amx to <256 x i32>.
    Prepare(Bitcast->getOperand(0)->getType());
    Builder.CreateStore(Src, AllocaAddr);
    // TODO: we can pick a constant operand for the shape.
    Value *Row = nullptr, *Col = nullptr;
    std::tie(Row, Col) = getShape(II, OpNo);
    std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
    Value *NewInst = Builder.CreateIntrinsic(
        Intrinsic::x86_tileloadd64_internal, std::nullopt, Args);
    Bitcast->replaceAllUsesWith(NewInst);
  } else {
    // Tile to vector: tile-store the tile and load the vector back.
    auto *II = dyn_cast<IntrinsicInst>(Src);
    if (!II)
      return false; // May be bitcast from <256 x i32> to x86_amx.
    Prepare(Bitcast->getType());
    Value *Row = II->getOperand(0);
    Value *Col = II->getOperand(1);
    std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Src};
    Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, std::nullopt,
                            Args);
    Value *NewInst = Builder.CreateLoad(Bitcast->getType(), AllocaAddr);
    Bitcast->replaceAllUsesWith(NewInst);
  }
  return true;
}
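// Fallback sketched in IR for the vector-to-tile direction (illustrative):
//   %2 = bitcast <256 x i32> %src to x86_amx
// -->
//   %addr = alloca <256 x i32>, align 64
//   store <256 x i32> %src, ptr %addr, align 64
//   %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
//                                                    ptr %addr, i64 64)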
bool X86LowerAMXType::visit() {
  SmallVector<Instruction *, 8> DeadInsts;
  for (BasicBlock *BB : post_order(&Func)) {
    for (Instruction &Inst : make_early_inc_range(reverse(*BB))) {
      auto *Bitcast = dyn_cast<BitCastInst>(&Inst);
      if (!Bitcast)
        continue;
      Value *Src = Bitcast->getOperand(0);
      if (Bitcast->getType()->isX86_AMXTy()) {
        // vector -> tile: fold a feeding load into tileloadd64; otherwise
        // go through a stack slot.
        auto *LD = dyn_cast<LoadInst>(Src);
        if (!LD) {
          if (transformBitcast(Bitcast))
            DeadInsts.push_back(Bitcast);
          continue;
        }
        combineLoadBitcast(LD, Bitcast);
        DeadInsts.push_back(Bitcast);
      } else if (Src->getType()->isX86_AMXTy()) {
        // tile -> vector: fold a consuming store into tilestored64.
        StoreInst *ST = nullptr;
        for (Use &U : Bitcast->uses())
          if ((ST = dyn_cast<StoreInst>(U.getUser())))
            break;
        if (!ST) {
          if (transformBitcast(Bitcast))
            DeadInsts.push_back(Bitcast);
          continue;
        }
        combineBitcastStore(Bitcast, ST);
        DeadInsts.push_back(ST);
        DeadInsts.push_back(Bitcast);
      }
    }
  }
  bool C = !DeadInsts.empty();
  for (auto *Inst : DeadInsts)
    Inst->eraseFromParent();
  return C;
}
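// End-to-end effect of visit() on a simple kernel (hedged, illustrative):
//   %a = load <256 x i32>, ptr %pa          ; becomes tileloadd64
//   %ta = bitcast <256 x i32> %a to x86_amx ; erased
//   %td = call x86_amx @llvm.x86.tdpbssd.internal(...)
//   %d = bitcast x86_amx %td to <256 x i32> ; erased
//   store <256 x i32> %d, ptr %pd           ; becomes tilestored64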
static Value *getAllocaPos(BasicBlock *BB) {
  Function *F = BB->getParent();
  IRBuilder<> Builder(&F->getEntryBlock().front());
  const DataLayout &DL = F->getParent()->getDataLayout();
  unsigned AllocaAS = DL.getAllocaAddrSpace();
  Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
  AllocaInst *AllocaRes =
      new AllocaInst(V256I32Ty, AllocaAS, "", F->getEntryBlock().begin());
  BasicBlock::iterator Iter = AllocaRes->getIterator();
  ++Iter;
  Builder.SetInsertPoint(&*Iter);
  Value *I8Ptr = Builder.CreateBitCast(AllocaRes, Builder.getPtrTy());
  return I8Ptr;
}
static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
  assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!");
  auto *II = cast<IntrinsicInst>(TileDef);
  assert(II && "Not tile intrinsic!");
  Value *Row = II->getOperand(0);
  Value *Col = II->getOperand(1);

  BasicBlock *BB = TileDef->getParent();
  BasicBlock::iterator Iter = TileDef->getIterator();
  IRBuilder<> Builder(BB, ++Iter);
  Value *Stride = Builder.getInt64(64);
  std::array<Value *, 5> Args = {Row, Col, Ptr, Stride, TileDef};

  Instruction *TileStore = Builder.CreateIntrinsic(
      Intrinsic::x86_tilestored64_internal, std::nullopt, Args);
  return TileStore;
}
static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
  Value *V = U.get();
  assert(V->getType()->isX86_AMXTy() && "Not define tile!");

  // Get the tile shape from the defining intrinsic (or, for a PHI, from the
  // intrinsic defining its first incoming value).
  IntrinsicInst *II = nullptr;
  if (IsPHI) {
    Value *PhiOp = cast<PHINode>(V)->getIncomingValue(0);
    II = cast<IntrinsicInst>(PhiOp);
  } else {
    II = cast<IntrinsicInst>(V);
  }
  Value *Row = II->getOperand(0);
  Value *Col = II->getOperand(1);

  Instruction *UserI = cast<Instruction>(U.getUser());
  IRBuilder<> Builder(UserI);
  Value *Stride = Builder.getInt64(64);
  std::array<Value *, 4> Args = {Row, Col, Ptr, Stride};

  Value *TileLoad = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal,
                                            std::nullopt, Args);
  UserI->replaceUsesOfWith(V, TileLoad);
}

static bool isIncomingOfPHI(Instruction *I) {
  for (Use &U : I->uses()) {
    User *V = U.getUser();
    if (isa<PHINode>(V))
      return true;
  }
  return false;
}
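// Together, createTileStore and replaceWithTileLoad implement the "volatile"
// model: a tile definition is immediately stored to a stack slot and every
// use reloads it, e.g. (illustrative):
//   %td = call x86_amx @llvm.x86.tilezero.internal(i16 %r, i16 %c)
//   call void @llvm.x86.tilestored64.internal(i16 %r, i16 %c, ptr %slot,
//                                             i64 64, x86_amx %td)
//   %tu = call x86_amx @llvm.x86.tileloadd64.internal(i16 %r, i16 %c,
//                                                     ptr %slot, i64 64)
//   ... former uses of %td are rewritten to use %tu ...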
class X86VolatileTileData {
  Function &F;

public:
  X86VolatileTileData(Function &Func) : F(Func) {}
  Value *updatePhiIncomings(BasicBlock *BB,
                            SmallVector<Instruction *, 2> &Incomings);
  void replacePhiDefWithLoad(Instruction *PHI, Value *StorePtr);
  void volatileTilePHI(PHINode *PHI);
  void volatileTileNonPHI(Instruction *I);
  bool volatileTileData();
};
Value *X86VolatileTileData::updatePhiIncomings(
    BasicBlock *BB, SmallVector<Instruction *, 2> &Incomings) {
  Value *I8Ptr = getAllocaPos(BB);

  for (auto *I : Incomings) {
    User *Store = createTileStore(I, I8Ptr);

    // All uses of the incoming tile (except the PHI and the store we just
    // created) should load from the stored memory.
    for (Use &U : I->uses()) {
      User *V = U.getUser();
      if (isa<PHINode>(V) || V == Store)
        continue;
      replaceWithTileLoad(U, I8Ptr);
    }
  }
  return I8Ptr;
}

void X86VolatileTileData::replacePhiDefWithLoad(Instruction *PHI,
                                                Value *StorePtr) {
  for (Use &U : PHI->uses())
    replaceWithTileLoad(U, StorePtr, true);
  PHI->eraseFromParent();
}
void X86VolatileTileData::volatileTilePHI(PHINode *PHI) {
  BasicBlock *BB = PHI->getParent();
  SmallVector<Instruction *, 2> Incomings;

  for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
    Value *Op = PHI->getIncomingValue(I);
    Instruction *Inst = dyn_cast<Instruction>(Op);
    assert(Inst && "We shouldn't fold AMX instruction!");
    Incomings.push_back(Inst);
  }

  Value *StorePtr = updatePhiIncomings(BB, Incomings);
  replacePhiDefWithLoad(PHI, StorePtr);
}
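// Sketch of the PHI rewrite (illustrative):
//   bb.1: %t1 = call x86_amx ...      bb.2: %t2 = call x86_amx ...
//   bb.3: %td = phi x86_amx [ %t1, %bb.1 ], [ %t2, %bb.2 ]
// becomes a tilestored64 of %t1/%t2 to a shared stack slot in each incoming
// block; the PHI is erased, and every former use of %td reloads the tile
// from that slot with tileloadd64.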
void X86VolatileTileData::volatileTileNonPHI(Instruction *I) {
  BasicBlock *BB = I->getParent();
  Value *I8Ptr = getAllocaPos(BB);
  User *Store = createTileStore(I, I8Ptr);

  // All uses of the tile should load from the stored memory.
  for (Use &U : I->uses()) {
    User *V = U.getUser();
    assert(!isa<PHINode>(V) && "PHI Nodes should be excluded!");
    if (V != Store)
      replaceWithTileLoad(U, I8Ptr);
  }
}
bool X86VolatileTileData::volatileTileData() {
  bool Changed = false;
  for (BasicBlock &BB : F) {
    SmallVector<Instruction *, 2> PHIInsts;
    SmallVector<Instruction *, 8> AMXDefInsts;
    for (Instruction &I : BB) {
      if (!I.getType()->isX86_AMXTy())
        continue;
      if (isa<PHINode>(&I))
        PHIInsts.push_back(&I);
      else
        AMXDefInsts.push_back(&I);
    }
    // "Volatile" the non-PHI tile definitions first, then the PHIs.
    for (Instruction *I : AMXDefInsts) {
      if (isIncomingOfPHI(I))
        continue;
      volatileTileNonPHI(I);
      Changed = true;
    }
    for (Instruction *I : PHIInsts) {
      volatileTilePHI(dyn_cast<PHINode>(I));
      Changed = true;
    }
  }
  return Changed;
}
class X86LowerAMXCast {
  Function &Func;
  std::unique_ptr<DominatorTree> DT;

public:
  X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {}
  bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST);
  bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD);
  bool combineLdSt(SmallVectorImpl<Instruction *> &Casts);
  bool combineAMXcast(TargetLibraryInfo *TLI);
  bool transformAMXCast(IntrinsicInst *AMXCast);
  bool transformAllAMXCast();
  bool optimizeAMXCastFromPhi(IntrinsicInst *CI, PHINode *PN,
                              SmallSetVector<Instruction *, 16> &DeadInst);
};
static bool DCEInstruction(Instruction *I,
                           SmallSetVector<Instruction *, 16> &WorkList,
                           const TargetLibraryInfo *TLI) {
  if (!isInstructionTriviallyDead(I, TLI))
    return false;
  salvageDebugInfo(*I);
  salvageKnowledge(I);

  // Null out all of the instruction's operands to see if any operand becomes
  // dead as we go.
  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
    Value *OpV = I->getOperand(i);
    I->setOperand(i, nullptr);

    if (!OpV->use_empty() || I == OpV)
      continue;

    // If nulling out the operand made it trivially dead, queue it for
    // deletion in a later iteration.
    if (Instruction *OpI = dyn_cast<Instruction>(OpV)) {
      if (isInstructionTriviallyDead(OpI, TLI))
        WorkList.insert(OpI);
    }
  }
  I->eraseFromParent();
  return true;
}
bool X86LowerAMXCast::optimizeAMXCastFromPhi(
    IntrinsicInst *CI, PHINode *PN,
    SmallSetVector<Instruction *, 16> &DeadInst) {
  IRBuilder<> Builder(CI);
  Value *Src = CI->getOperand(0);
  Type *SrcTy = Src->getType(); // Type B
  Type *DestTy = CI->getType(); // Type A

  SmallVector<PHINode *, 4> PhiWorklist;
  SmallSetVector<PHINode *, 4> OldPhiNodes;

  // Find all of the A->B casts and PHI nodes.
  // We need to inspect all related PHI nodes, but PHIs can be cyclic, so
  // OldPhiNodes tracks all known PHI nodes; before a new PHI is added to
  // PhiWorklist, it is checked against and inserted into OldPhiNodes first.
  PhiWorklist.push_back(PN);
  OldPhiNodes.insert(PN);
  while (!PhiWorklist.empty()) {
    auto *OldPN = PhiWorklist.pop_back_val();
    for (unsigned I = 0; I < OldPN->getNumOperands(); ++I) {
      Value *IncValue = OldPN->getIncomingValue(I);
      // TODO: currently we ignore most constant incoming values; only undef
      // and zero are handled, via tilezero.
      if (isa<Constant>(IncValue)) {
        auto *IncConst = dyn_cast<Constant>(IncValue);
        if (!isa<UndefValue>(IncValue) && !IncConst->isZeroValue())
          return false;
        Value *Row = nullptr, *Col = nullptr;
        std::tie(Row, Col) = getShape(OldPN);
        // TODO: If the shape is not constant, Row and Col must dominate the
        // tilezero that we are going to create.
        if (!Row || !Col || !isa<Constant>(Row) || !isa<Constant>(Col))
          return false;
        // Create tilezero at the end of the incoming block.
        auto *Block = OldPN->getIncomingBlock(I);
        BasicBlock::iterator Iter = Block->getTerminator()->getIterator();
        Instruction *NewInst = Builder.CreateIntrinsic(
            Intrinsic::x86_tilezero_internal, std::nullopt, {Row, Col});
        NewInst->moveBefore(&*Iter);
        NewInst = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
                                          {IncValue->getType()}, {NewInst});
        NewInst->moveBefore(&*Iter);
        // Replace the incoming value with the new value.
        OldPN->setIncomingValue(I, NewInst);
        IncValue = NewInst;
      }

      if (auto *PNode = dyn_cast<PHINode>(IncValue)) {
        if (OldPhiNodes.insert(PNode))
          PhiWorklist.push_back(PNode);
        continue;
      }
      Instruction *ACI = dyn_cast<Instruction>(IncValue);
      if (ACI && isAMXCast(ACI)) {
        // Verify it's an A->B cast.
        Type *TyA = ACI->getOperand(0)->getType();
        Type *TyB = ACI->getType();
        if (TyA != DestTy || TyB != SrcTy)
          return false;
        continue;
      }
      return false;
    }
  }

  // Check that each user of each old PHI node is something we can rewrite,
  // so that all of the old PHI nodes can be cleaned up afterwards.
  for (auto *OldPN : OldPhiNodes) {
    for (User *V : OldPN->users()) {
      Instruction *ACI = dyn_cast<Instruction>(V);
      if (ACI && isAMXCast(ACI)) {
        // Verify it's a B->A cast.
        Type *TyB = ACI->getOperand(0)->getType();
        Type *TyA = ACI->getType();
        if (TyA != DestTy || TyB != SrcTy)
          return false;
      } else if (auto *PHI = dyn_cast<PHINode>(V)) {
        // As long as the user is another old PHI node, then even if we don't
        // rewrite it, its users will be rewritten by this loop.
        if (OldPhiNodes.count(PHI) == 0)
          return false;
      } else
        return false;
    }
  }

  // For each old PHI node, create a corresponding new PHI node of type A.
  SmallDenseMap<PHINode *, PHINode *> NewPNodes;
  for (auto *OldPN : OldPhiNodes) {
    Builder.SetInsertPoint(OldPN);
    PHINode *NewPN = Builder.CreatePHI(DestTy, OldPN->getNumOperands());
    NewPNodes[OldPN] = NewPN;
  }

  // Fill in the operands of the new PHI nodes.
  for (auto *OldPN : OldPhiNodes) {
    PHINode *NewPN = NewPNodes[OldPN];
    for (unsigned j = 0, e = OldPN->getNumOperands(); j != e; ++j) {
      Value *V = OldPN->getOperand(j);
      Value *NewV = nullptr;
      Instruction *ACI = dyn_cast<Instruction>(V);
      // There should not be an AMX cast from a constant.
      if (ACI && isAMXCast(ACI))
        NewV = ACI->getOperand(0);
      else if (auto *PrevPN = dyn_cast<PHINode>(V))
        NewV = NewPNodes[PrevPN];
      assert(NewV);
      NewPN->addIncoming(NewV, OldPN->getIncomingBlock(j));
    }
  }

  // Traverse all accumulated PHI nodes, rewrite their users (B->A casts and
  // other old PHIs), and queue the dead casts for DCE.
  for (auto *OldPN : OldPhiNodes) {
    PHINode *NewPN = NewPNodes[OldPN];
    for (User *V : make_early_inc_range(OldPN->users())) {
      Instruction *ACI = dyn_cast<Instruction>(V);
      if (ACI && isAMXCast(ACI)) {
        Type *TyB = ACI->getOperand(0)->getType();
        Type *TyA = ACI->getType();
        assert(TyA == DestTy && TyB == SrcTy);
        (void)TyA;
        (void)TyB;
        ACI->replaceAllUsesWith(NewPN);
        DeadInst.insert(ACI);
      } else if (auto *PHI = dyn_cast<PHINode>(V)) {
        // Old PHIs are operands of other old PHIs; DCE can delete them once
        // the root PHI is dead.
        assert(OldPhiNodes.contains(PHI));
        (void)PHI;
      } else
        llvm_unreachable("all uses should be handled");
    }
  }
  return true;
}
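// Sketch of the PHI optimization (hedged, illustrative): a tile->vector cast
// feeding a vector PHI that is itself consumed by a vector->tile cast of the
// same types
//   %v = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t)
//   %p = phi <256 x i32> [ %v, %bb1 ], [ %p2, %bb2 ]
//   %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %p)
// is rewritten as a PHI over x86_amx values, making both casts dead:
//   %p = phi x86_amx [ %t, %bb1 ], [ %p2', %bb2 ]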
bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
  Value *Tile = Cast->getOperand(0);
  // TODO: If it is a cast intrinsic or a PHI node, we can propagate the
  // shape information through the def-use chain.
  if (!isAMXIntrinsic(Tile))
    return false;
  auto *II = cast<IntrinsicInst>(Tile);
  // The tile is the output of an AMX intrinsic: the first operand is the
  // row, the second operand is the column.
  Value *Row = II->getOperand(0);
  Value *Col = II->getOperand(1);
  IRBuilder<> Builder(ST);
  // The stride should be equal to the column, measured in bytes.
  Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
  Value *I8Ptr = Builder.CreateBitCast(ST->getOperand(1), Builder.getPtrTy());
  std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile};
  Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, std::nullopt,
                          Args);
  return true;
}
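// Sketched in IR (names illustrative):
//   %v = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t)
//   store <256 x i32> %v, ptr %p, align 64
// -->
//   call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %p,
//                                             i64 %stride, x86_amx %t)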
bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
  bool EraseLoad = true;
  Value *Row = nullptr, *Col = nullptr;
  Use &U = *(Cast->use_begin());
  unsigned OpNo = U.getOperandNo();
  auto *II = cast<IntrinsicInst>(U.getUser());
  // TODO: If it is a cast intrinsic or a PHI node, we can propagate the
  // shape information through the def-use chain.
  if (!isAMXIntrinsic(II))
    return false;
  std::tie(Row, Col) = getShape(II, OpNo);
  IRBuilder<> Builder(LD);
  // The stride should be equal to the column, measured in bytes.
  Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
  Value *I8Ptr;

  // To save compile time, create the dominator tree lazily.
  if (!DT)
    DT.reset(new DominatorTree(Func));
  if (!DT->dominates(Row, LD) || !DT->dominates(Col, LD)) {
    // The shape does not dominate the load: store the loaded value to a
    // stack slot and tile-load it from there right before the cast.
    auto *AllocaAddr =
        createAllocaInstAtEntry(Builder, Cast->getParent(), LD->getType());
    Builder.SetInsertPoint(&*std::next(LD->getIterator()));
    Builder.CreateStore(LD, AllocaAddr);

    Builder.SetInsertPoint(Cast);
    I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getPtrTy());
    EraseLoad = false;
  } else {
    I8Ptr = Builder.CreateBitCast(LD->getOperand(0), Builder.getPtrTy());
  }
  std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};

  Value *NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal,
                                           std::nullopt, Args);
  Cast->replaceAllUsesWith(NewInst);

  return EraseLoad;
}
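// Sketched in IR (names illustrative):
//   %v = load <256 x i32>, ptr %p, align 64
//   %t = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %v)
// -->
//   %t = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
//                                                    ptr %p, i64 %stride)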
bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
  bool Change = false;
  for (auto *Cast : Casts) {
    auto *II = cast<IntrinsicInst>(Cast);
    if (II->getIntrinsicID() == Intrinsic::x86_cast_tile_to_vector) {
      // Fold each store of the cast result into a tilestored64.
      SmallVector<Instruction *, 2> DeadStores;
      for (User *U : Cast->users()) {
        StoreInst *Store = dyn_cast<StoreInst>(U);
        if (!Store)
          continue;
        if (combineCastStore(cast<IntrinsicInst>(Cast), Store)) {
          DeadStores.push_back(Store);
          Change = true;
        }
      }
      for (auto *Store : DeadStores)
        Store->eraseFromParent();
    } else { // x86_cast_vector_to_tile
      auto *Load = dyn_cast<LoadInst>(Cast->getOperand(0));
      if (!Load || !Load->hasOneUse())
        continue;
      if (combineLoadCast(cast<IntrinsicInst>(Cast), Load)) {
        // Clear the operand so the load can be erased.
        Cast->setOperand(0, nullptr);
        Load->eraseFromParent();
      }
    }
  }
  return Change;
}
bool X86LowerAMXCast::combineAMXcast(TargetLibraryInfo *TLI) {
  bool Change = false;
  // Collect tile cast instructions.
  SmallVector<Instruction *, 8> Vec2TileInsts;
  SmallVector<Instruction *, 8> Tile2VecInsts;
  SmallVector<Instruction *, 8> PhiCastWorkList;
  SmallSetVector<Instruction *, 16> DeadInst;
  for (BasicBlock &BB : Func) {
    for (Instruction &I : BB) {
      Value *Vec;
      if (match(&I,
                m_Intrinsic<Intrinsic::x86_cast_vector_to_tile>(m_Value(Vec))))
        Vec2TileInsts.push_back(&I);
      else if (match(&I, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(
                             m_Value(Vec))))
        Tile2VecInsts.push_back(&I);
    }
  }

  auto Convert = [&](SmallVectorImpl<Instruction *> &Insts, Intrinsic::ID IID) {
    for (auto *Inst : Insts) {
      for (User *U : Inst->users()) {
        IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
        if (!II || II->getIntrinsicID() != IID)
          continue;
        // %1 = vec2tile %0
        // %2 = tile2vec %1
        // %3 = OP %2
        // -->
        // %3 = OP %0
        II->replaceAllUsesWith(Inst->getOperand(0));
        Change = true;
      }
    }
  };

  Convert(Vec2TileInsts, Intrinsic::x86_cast_tile_to_vector);
  Convert(Tile2VecInsts, Intrinsic::x86_cast_vector_to_tile);

  SmallVector<Instruction *, 8> LiveCasts;
  auto EraseInst = [&](SmallVectorImpl<Instruction *> &Insts) {
    for (auto *Inst : Insts) {
      if (Inst->use_empty()) {
        Inst->eraseFromParent();
        Change = true;
      } else {
        LiveCasts.push_back(Inst);
      }
    }
  };

  EraseInst(Vec2TileInsts);
  EraseInst(Tile2VecInsts);
  LLVM_DEBUG(dbgs() << "[LowerAMXType][combineAMXcast] IR dump after combine "
                       "Vec2Tile and Tile2Vec:\n";
             Func.dump());
  Change |= combineLdSt(LiveCasts);
  EraseInst(LiveCasts);
  LLVM_DEBUG(dbgs() << "[LowerAMXType][combineAMXcast] IR dump after combine "
                       "AMXCast and load/store:\n";
             Func.dump());

  // Handle the A->B->A cast with an intervening PHI node.
  for (BasicBlock &BB : Func) {
    for (Instruction &I : BB) {
      if (isAMXCast(&I)) {
        if (isa<PHINode>(I.getOperand(0)))
          PhiCastWorkList.push_back(&I);
      }
    }
  }
  for (auto *I : PhiCastWorkList) {
    // Skip dead AMX casts.
    if (DeadInst.contains(I))
      continue;
    PHINode *PN = cast<PHINode>(I->getOperand(0));
    if (optimizeAMXCastFromPhi(cast<IntrinsicInst>(I), PN, DeadInst)) {
      DeadInst.insert(PN);
      Change = true;
    }
  }

  // Since we create new PHIs and merge AMX casts, some old PHIs and casts
  // may be left without uses; run DCE over them here.
  while (!DeadInst.empty()) {
    Instruction *I = DeadInst.pop_back_val();
    Change |= DCEInstruction(I, DeadInst, TLI);
  }
  LLVM_DEBUG(dbgs() << "[LowerAMXType][combineAMXcast] IR dump after "
                       "optimizeAMXCastFromPhi:\n";
             Func.dump());
  return Change;
}
// There might be remaining AMX casts after combineAMXcast; handle them by
// going through a stack slot, like transformBitcast.
bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) {
  IRBuilder<> Builder(AMXCast);
  AllocaInst *AllocaAddr;
  Value *I8Ptr, *Stride;
  auto *Src = AMXCast->getOperand(0);

  auto Prepare = [&](Type *MemTy) {
    AllocaAddr = createAllocaInstAtEntry(Builder, AMXCast->getParent(), MemTy);
    I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getPtrTy());
    Stride = Builder.getInt64(64);
  };

  if (AMXCast->getType()->isX86_AMXTy()) {
    // Vector to tile: spill the vector and tile-load it back.
    if (AMXCast->use_empty()) {
      AMXCast->eraseFromParent();
      return true;
    }
    Use &U = *(AMXCast->use_begin());
    unsigned OpNo = U.getOperandNo();
    auto *II = dyn_cast<IntrinsicInst>(U.getUser());
    if (!II)
      return false; // May be cast from x86_amx to <256 x i32>.
    Prepare(AMXCast->getOperand(0)->getType());
    Builder.CreateStore(Src, AllocaAddr);
    // TODO: we can pick a constant operand for the shape.
    Value *Row = nullptr, *Col = nullptr;
    std::tie(Row, Col) = getShape(II, OpNo);
    std::array<Value *, 4> Args = {
        Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())};
    Value *NewInst = Builder.CreateIntrinsic(
        Intrinsic::x86_tileloadd64_internal, std::nullopt, Args);
    AMXCast->replaceAllUsesWith(NewInst);
    AMXCast->eraseFromParent();
  } else {
    // Tile to vector: tile-store the tile and load the vector back.
    auto *II = dyn_cast<IntrinsicInst>(Src);
    if (!II)
      return false; // May be cast from <256 x i32> to x86_amx.
    Prepare(AMXCast->getType());
    Value *Row = II->getOperand(0);
    Value *Col = II->getOperand(1);
    std::array<Value *, 5> Args = {
        Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty()), Src};
    Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, std::nullopt,
                            Args);
    Value *NewInst = Builder.CreateLoad(AMXCast->getType(), AllocaAddr);
    AMXCast->replaceAllUsesWith(NewInst);
    AMXCast->eraseFromParent();
  }
  return true;
}
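// Note the stride here is the column sign-extended to i64 rather than the
// fixed 64, since the stack slot is only as wide as a tile row, e.g.
// (illustrative):
//   %t = call x86_amx @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32> %v)
// -->
//   %addr = alloca <225 x i32>, align 64
//   store <225 x i32> %v, ptr %addr
//   %t = call x86_amx @llvm.x86.tileloadd64.internal(i16 15, i16 60,
//                                                    ptr %addr, i64 60)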
bool X86LowerAMXCast::transformAllAMXCast() {
  bool Change = false;
  // Collect tile cast instructions.
  SmallVector<Instruction *, 8> WorkLists;
  for (BasicBlock &BB : Func)
    for (Instruction &I : BB)
      if (isAMXCast(&I))
        WorkLists.push_back(&I);

  for (auto *Inst : WorkLists)
    Change |= transformAMXCast(cast<IntrinsicInst>(Inst));

  return Change;
}
class X86LowerAMXTypeLegacyPass : public FunctionPass {
public:
  static char ID;

  X86LowerAMXTypeLegacyPass() : FunctionPass(ID) {
    initializeX86LowerAMXTypeLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    bool C = false;
    TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);

    X86LowerAMXCast LAC(F);
    C |= LAC.combineAMXcast(TLI);
    // There might be remaining AMX casts after combineAMXcast; lower them
    // through stack slots.
    C |= LAC.transformAllAMXCast();

    X86LowerAMXType LAT(F);
    C |= LAT.visit();

    // Prepare for fast register allocation at O0.
    if (TM->getOptLevel() == CodeGenOptLevel::None) {
      // If the front end did not use O0 but the mid/back end does (e.g.
      // "clang -O2 -S -emit-llvm t.c" + "llc t.ll"), make sure the AMX data
      // is volatile; that is necessary for AMX fast register allocation.
      if (!F.hasFnAttribute(Attribute::OptimizeNone)) {
        X86VolatileTileData VTD(F);
        C = VTD.volatileTileData() || C;
      }
    }

    return C;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
  }
};

static const char PassName[] = "Lower AMX type for load/store";
char X86LowerAMXTypeLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
                    false)

FunctionPass *llvm::createX86LowerAMXTypePass() {
  return new X86LowerAMXTypeLegacyPass();
}