#include "llvm/IR/IntrinsicsX86.h"

using namespace PatternMatch;

#define DEBUG_TYPE "lower-amx-type"
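// For orientation: C/C++ source of roughly the following shape, compiled with
// clang's AMX intrinsics (assuming -mamx-tile -mamx-int8), is what produces
// the <256 x i32> vectors and AMX cast/tile intrinsics this pass lowers. The
// function name, buffers, and shapes below are illustrative only.
//
//   #include <immintrin.h>
//
//   void tile_dp(const void *a, const void *b, void *c, long stride) {
//     __tile1024i ta = {16, 64}; // 16 rows, 64 bytes per row
//     __tile1024i tb = {16, 64};
//     __tile1024i tc = {16, 64};
//     __tile_loadd(&ta, a, stride);
//     __tile_loadd(&tb, b, stride);
//     __tile_loadd(&tc, c, stride);
//     __tile_dpbssd(&tc, ta, tb); // tc += ta * tb (signed i8 dot product)
//     __tile_stored(c, stride, tc);
//   }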
static bool isAMXCast(Instruction *II) {
  return match(II,
               m_Intrinsic<Intrinsic::x86_cast_vector_to_tile>(m_Value())) ||
         match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value()));
}
static bool isAMXIntrinsic(Value *I) {
  auto *II = dyn_cast<IntrinsicInst>(I);
  if (!II || isAMXCast(II))
    return false;
  // An intrinsic is an AMX intrinsic if it produces or consumes an x86_amx.
  if (II->getType()->isX86_AMXTy())
    return true;
  for (Value *V : II->args())
    if (V->getType()->isX86_AMXTy())
      return true;
  return false;
}
// From containsAMXCode(Function &F): the function is considered to contain
// AMX code if any instruction produces an x86_amx-typed value.
      if (I.getType()->isX86_AMXTy())
        return true;

// From createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB, Type *Ty):
// create an alloca in the entry block to hold tile data in memory.
  unsigned AllocaAS = DL.getAllocaAddrSpace();
  AllocaInst *AllocaRes =
      new AllocaInst(Ty, AllocaAS, "", F.getEntryBlock().begin());

// From getFirstNonAllocaInTheEntryBlock(Function &F): skip over the leading
// allocas when picking an insertion point in the entry block.
    if (!isa<AllocaInst>(&I))
      return &I;
static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
  IRBuilder<> Builder(II);
  Value *Row = nullptr, *Col = nullptr;
  switch (II->getIntrinsicID()) {
  default:
    llvm_unreachable("Expect amx intrinsics");
  case Intrinsic::x86_tileloadd64_internal:
  case Intrinsic::x86_tileloaddt164_internal:
  case Intrinsic::x86_tilestored64_internal: {
    Row = II->getArgOperand(0);
    Col = II->getArgOperand(1);
    break;
  }
  // a * b + c: the shape depends on which operand is being queried.
  case Intrinsic::x86_tcmmimfp16ps_internal:
  case Intrinsic::x86_tcmmrlfp16ps_internal:
  case Intrinsic::x86_tdpbssd_internal:
  case Intrinsic::x86_tdpbsud_internal:
  case Intrinsic::x86_tdpbusd_internal:
  case Intrinsic::x86_tdpbuud_internal:
  case Intrinsic::x86_tdpbf16ps_internal:
  case Intrinsic::x86_tdpfp16ps_internal: {
    switch (OpNo) {
    case 3:
      Row = II->getArgOperand(0);
      Col = II->getArgOperand(1);
      break;
    case 4:
      Row = II->getArgOperand(0);
      Col = II->getArgOperand(2);
      break;
    case 5:
      if (isa<ConstantInt>(II->getArgOperand(2)))
        Row = Builder.getInt16(
            (cast<ConstantInt>(II->getOperand(2))->getSExtValue()) / 4);
      else if (isa<Instruction>(II->getArgOperand(2))) {
        // The row of the second multiplicand is K/4; emit the udiv right
        // after the definition of K so it dominates the new tile load.
        Builder.SetInsertPoint(cast<Instruction>(II->getOperand(2)));
        Row = Builder.CreateUDiv(II->getOperand(2), Builder.getInt16(4));
        cast<Instruction>(Row)->moveAfter(cast<Instruction>(II->getOperand(2)));
      } else {
        // K comes from a function argument: emit the udiv in the entry block.
        IRBuilder<> NewBuilder(
            getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
        Row = NewBuilder.CreateUDiv(II->getOperand(2), NewBuilder.getInt16(4));
      }
      Col = II->getArgOperand(1);
      break;
    }
    break;
  }
  }

  return std::make_pair(Row, Col);
}
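// A reading of the switch above for the dot-product intrinsics (tdp*/tcmm*),
// whose i16 shape operands are arg0 = rows of C and A, arg1 = column bytes of
// C and B, arg2 = column bytes of A; the M/N/K names are illustrative:
//   OpNo 3 (accumulator C): Row = arg0 (M),     Col = arg1 (N bytes)
//   OpNo 4 (operand A):     Row = arg0 (M),     Col = arg2 (K bytes)
//   OpNo 5 (operand B):     Row = arg2 / 4,     Col = arg1 (N bytes)
// Operand B has arg2/4 rows because each of its 32-bit tile elements packs
// four bytes of the K dimension.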
static std::pair<Value *, Value *> getShape(PHINode *Phi) {
  Use &U = *(Phi->use_begin());
  unsigned OpNo = U.getOperandNo();
  User *V = U.getUser();
  // Only the first user is traversed; if no shape is found, the undef/zero
  // optimization is simply skipped.
  while (V) {
    if (isAMXCast(dyn_cast<Instruction>(V))) {
      if (V->use_empty())
        break;
      Use &U = *(V->use_begin());
      OpNo = U.getOperandNo();
      V = U.getUser();
    } else if (isAMXIntrinsic(V)) {
      return getShape(cast<IntrinsicInst>(V), OpNo);
    } else if (isa<PHINode>(V)) {
      if (V->use_empty())
        break;
      Use &U = *(V->use_begin());
      V = U.getUser();
    } else {
      break;
    }
  }

  return std::make_pair(nullptr, nullptr);
}
class X86LowerAMXType {
  Function &Func;
  // Cache of row values derived from a column (RealCol = Col / ElementSize).
  std::map<Value *, Value *> Col2Row;
// Fuse a load of <256 x i32> followed by a bitcast to x86_amx into a direct
// @llvm.x86.tileloadd64.internal call.
void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
  Value *Row = nullptr, *Col = nullptr;
  Use &U = *(Bitcast->use_begin());
  unsigned OpNo = U.getOperandNo();
  auto *II = cast<IntrinsicInst>(U.getUser());
  std::tie(Row, Col) = getShape(II, OpNo);
  IRBuilder<> Builder(Bitcast);
  // Use the maximum column as stride.
  Value *Stride = Builder.getInt64(64);
  Value *I8Ptr = LD->getOperand(0);
  std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};

  Value *NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal,
                                           std::nullopt, Args);
  Bitcast->replaceAllUsesWith(NewInst);
}
// Fuse a bitcast of x86_amx to <256 x i32> followed by a store into a direct
// @llvm.x86.tilestored64.internal call; any other users of the vector reload
// it from the stored memory.
void X86LowerAMXType::combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
  Value *Tile = Bitcast->getOperand(0);
  auto *II = cast<IntrinsicInst>(Tile);
  // Tile is the output of an AMX intrinsic: operand 0 is the row, operand 1
  // is the column.
  Value *Row = II->getOperand(0);
  Value *Col = II->getOperand(1);
  IRBuilder<> Builder(ST);
  // Use the maximum column as stride; it must match the load stride.
  Value *Stride = Builder.getInt64(64);
  Value *I8Ptr = ST->getOperand(1);
  std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile};
  Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, std::nullopt,
                          Args);
  if (Bitcast->hasOneUse())
    return;
  // The bitcast has other users: let them load the value back from memory.
  Value *Vec = Builder.CreateLoad(Bitcast->getType(), ST->getOperand(1));
  Bitcast->replaceAllUsesWith(Vec);
}
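// Illustrative IR for the two rewrites above (value names and shapes are made
// up for the example):
//
//   %src = load <256 x i32>, ptr %addr, align 64
//   %t   = bitcast <256 x i32> %src to x86_amx
// becomes
//   %t   = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
//                                                      ptr %addr, i64 64)
//
//   %v = bitcast x86_amx %tile to <256 x i32>
//   store <256 x i32> %v, ptr %addr, align 64
// becomes
//   call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %addr,
//                                             i64 64, x86_amx %tile)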
// Fallback for bitcasts that cannot be fused with an adjacent load/store:
// round-trip the value through a stack slot.
bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
  IRBuilder<> Builder(Bitcast);
  AllocaInst *AllocaAddr;
  Value *I8Ptr, *Stride;
  auto *Src = Bitcast->getOperand(0);

  auto Prepare = [&](Type *MemTy) {
    AllocaAddr = createAllocaInstAtEntry(Builder, Bitcast->getParent(), MemTy);
    I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getPtrTy());
    Stride = Builder.getInt64(64);
  };

  if (Bitcast->getType()->isX86_AMXTy()) {
    // bitcast <256 x i32> %src to x86_amx: store the vector to the stack slot
    // and reload it as a tile.
    Use &U = *(Bitcast->use_begin());
    unsigned OpNo = U.getOperandNo();
    auto *II = dyn_cast<IntrinsicInst>(U.getUser());
    if (!II)
      return false; // May be bitcast from x86amx to <256 x i32>.
    Prepare(Bitcast->getOperand(0)->getType());
    Builder.CreateStore(Src, AllocaAddr);
    Value *Row = nullptr, *Col = nullptr;
    std::tie(Row, Col) = getShape(II, OpNo);
    std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
    Value *NewInst = Builder.CreateIntrinsic(
        Intrinsic::x86_tileloadd64_internal, std::nullopt, Args);
    Bitcast->replaceAllUsesWith(NewInst);
  } else {
    // bitcast x86_amx %src to <256 x i32>: store the tile to the stack slot
    // and reload it as a vector.
    auto *II = dyn_cast<IntrinsicInst>(Src);
    if (!II)
      return false; // May be bitcast from <256 x i32> to x86amx.
    Prepare(Bitcast->getType());
    Value *Row = II->getOperand(0);
    Value *Col = II->getOperand(1);
    std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Src};
    Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, std::nullopt,
                            Args);
    Value *NewInst = Builder.CreateLoad(Bitcast->getType(), AllocaAddr);
    Bitcast->replaceAllUsesWith(NewInst);
  }

  return true;
}
bool X86LowerAMXType::visit() {
  SmallVector<Instruction *, 8> DeadInsts;

  for (BasicBlock *BB : post_order(&Func)) {
    for (Instruction &Inst : llvm::make_early_inc_range(llvm::reverse(*BB))) {
      auto *Bitcast = dyn_cast<BitCastInst>(&Inst);
      if (!Bitcast)
        continue;
      Value *Src = Bitcast->getOperand(0);
      if (Bitcast->getType()->isX86_AMXTy()) {
        // <256 x i32> -> x86_amx: without a fusible load, spill through a
        // stack slot ...
        if (transformBitcast(Bitcast))
          DeadInsts.push_back(Bitcast);
        // ... otherwise fuse the load with the bitcast.
        combineLoadBitcast(LD, Bitcast);
        DeadInsts.push_back(Bitcast);
        // ...
      } else if (Src->getType()->isX86_AMXTy()) {
        // x86_amx -> <256 x i32>: look for a store user to fuse with.
        // ...
        ST = dyn_cast<StoreInst>(U.getUser());
        // ...
        if (transformBitcast(Bitcast))
          DeadInsts.push_back(Bitcast);
        // ...
        combineBitcastStore(Bitcast, ST);
        DeadInsts.push_back(Bitcast);
      }
    }
  }

  bool C = !DeadInsts.empty();
  for (auto *Inst : DeadInsts)
    Inst->eraseFromParent();

  return C;
}
// getAllocaPos: create a <256 x i32> stack slot in the entry block and return
// its address, used to "volatile" tile data through memory.
static Value *getAllocaPos(BasicBlock *BB) {
  Function *F = BB->getParent();
  IRBuilder<> Builder(&F->getEntryBlock().front());
  const DataLayout &DL = F->getParent()->getDataLayout();
  unsigned AllocaAS = DL.getAllocaAddrSpace();
  Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
  AllocaInst *AllocaRes =
      new AllocaInst(V256I32Ty, AllocaAS, "", F->getEntryBlock().begin());
  // ... (set the alignment and return the slot's address)
}
static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
  auto *II = cast<IntrinsicInst>(TileDef);
  assert(II && "Not tile intrinsic!");
  Value *Row = II->getOperand(0);
  Value *Col = II->getOperand(1);

  BasicBlock *BB = TileDef->getParent();
  BasicBlock::iterator Iter = TileDef->getIterator();
  IRBuilder<> Builder(BB, ++Iter);
  Value *Stride = Builder.getInt64(64);
  std::array<Value *, 5> Args = {Row, Col, Ptr, Stride, TileDef};

  Instruction *TileStore = Builder.CreateIntrinsic(
      Intrinsic::x86_tilestored64_internal, std::nullopt, Args);
  return TileStore;
}
static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
  Value *V = U.get();
  assert(V->getType()->isX86_AMXTy() && "Not define tile!");

  // Get the tile shape from the intrinsic that defines the tile (for a PHI,
  // from its first incoming value).
  IntrinsicInst *II = nullptr;
  if (IsPHI) {
    Value *PhiOp = cast<PHINode>(V)->getIncomingValue(0);
    II = cast<IntrinsicInst>(PhiOp);
  } else {
    II = cast<IntrinsicInst>(V);
  }
  Value *Row = II->getOperand(0);
  Value *Col = II->getOperand(1);

  Instruction *UserI = cast<Instruction>(U.getUser());
  IRBuilder<> Builder(UserI);
  Value *Stride = Builder.getInt64(64);
  std::array<Value *, 4> Args = {Row, Col, Ptr, Stride};
  Value *TileLoad = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal,
                                            std::nullopt, Args);
  UserI->replaceUsesOfWith(V, TileLoad);
}
static bool isIncomingOfPHI(Instruction *I) {
  for (Use &U : I->uses()) {
    User *V = U.getUser();
    if (isa<PHINode>(V))
      return true;
  }
  return false;
}
class X86VolatileTileData {
  Function &F;
  // ...
  bool volatileTileData();
Value *X86VolatileTileData::updatePhiIncomings(
    BasicBlock *BB, SmallVector<Instruction *, 2> &Incomings) {
  Value *I8Ptr = getAllocaPos(BB);
  for (auto *I : Incomings) {
    User *Store = createTileStore(I, I8Ptr);
    // Every other use of the tile (except the PHI and the store itself) now
    // loads from the stored memory.
    for (Use &U : I->uses()) {
      User *V = U.getUser();
      if (isa<PHINode>(V) || V == Store)
        continue;
      replaceWithTileLoad(U, I8Ptr);
    }
  }
  return I8Ptr;
}
void X86VolatileTileData::replacePhiDefWithLoad(Instruction *PHI,
                                                Value *StorePtr) {
  for (Use &U : PHI->uses())
    replaceWithTileLoad(U, StorePtr, true);
  PHI->eraseFromParent();
}
void X86VolatileTileData::volatileTilePHI(PHINode *PHI) {
  BasicBlock *BB = PHI->getParent();
  SmallVector<Instruction *, 2> Incomings;

  for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
    Value *Op = PHI->getIncomingValue(I);
    Instruction *Inst = dyn_cast<Instruction>(Op);
    assert(Inst && "We shouldn't fold AMX instrution!");
    Incomings.push_back(Inst);
  }

  Value *StorePtr = updatePhiIncomings(BB, Incomings);
  replacePhiDefWithLoad(PHI, StorePtr);
}
void X86VolatileTileData::volatileTileNonPHI(Instruction *I) {
  BasicBlock *BB = I->getParent();
  Value *I8Ptr = getAllocaPos(BB);
  User *Store = createTileStore(I, I8Ptr);

  // All other uses load the tile back from the stored memory.
  for (Use &U : I->uses()) {
    User *V = U.getUser();
    assert(!isa<PHINode>(V) && "PHI Nodes should be excluded!");
    if (V != Store)
      replaceWithTileLoad(U, I8Ptr);
  }
}
bool X86VolatileTileData::volatileTileData() {
  bool Changed = false;
  for (BasicBlock &BB : F) {
    SmallVector<Instruction *, 2> PHIInsts;
    SmallVector<Instruction *, 8> AMXDefInsts;
    for (Instruction &I : BB) {
      if (!I.getType()->isX86_AMXTy())
        continue;
      if (isa<PHINode>(&I))
        PHIInsts.push_back(&I);
      else
        AMXDefInsts.push_back(&I);
    }
    // "Volatile" the non-PHI AMX definitions first, then the PHI-related ones.
    for (Instruction *I : AMXDefInsts) {
      if (isIncomingOfPHI(I))
        continue;
      volatileTileNonPHI(I);
      Changed = true;
    }
    for (Instruction *I : PHIInsts) {
      volatileTilePHI(dyn_cast<PHINode>(I));
      Changed = true;
    }
  }
  return Changed;
}
class X86LowerAMXCast {
  Function &Func;
  std::unique_ptr<DominatorTree> DT;
  // ...
  bool transformAllAMXCast();
static bool DCEInstruction(Instruction *I,
                           SmallSetVector<Instruction *, 16> &WorkList,
                           const TargetLibraryInfo *TLI) {
  if (isInstructionTriviallyDead(I, TLI)) {
    salvageDebugInfo(*I);
    salvageKnowledge(I);
    // Null out the operands so that any operand that becomes trivially dead
    // can be queued for deletion as well.
    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
      Value *OpV = I->getOperand(i);
      I->setOperand(i, nullptr);
      if (!OpV->use_empty() || I == OpV)
        continue;
      if (Instruction *OpI = dyn_cast<Instruction>(OpV)) {
        if (isInstructionTriviallyDead(OpI, TLI))
          WorkList.insert(OpI);
      }
    }
    I->eraseFromParent();
    return true;
  }
  return false;
}
// optimizeAMXCastFromPhi: rewrite a web of PHI nodes whose values only flow
// through AMX casts so that the PHIs carry the destination type directly and
// the casts become dead.
bool X86LowerAMXCast::optimizeAMXCastFromPhi(
    IntrinsicInst *CI, PHINode *PN,
    SmallSetVector<Instruction *, 16> &DeadInst) {
  IRBuilder<> Builder(CI);
  Value *Src = CI->getOperand(0);
  Type *SrcTy = Src->getType(); // Type B
  Type *DestTy = CI->getType(); // Type A

  SmallVector<PHINode *, 4> PhiWorklist;
  SmallSetVector<PHINode *, 4> OldPhiNodes;

  // Find all of the PHI nodes in the cast chain.
  PhiWorklist.push_back(PN);
  OldPhiNodes.insert(PN);
  while (!PhiWorklist.empty()) {
    auto *OldPN = PhiWorklist.pop_back_val();
    for (unsigned I = 0; I < OldPN->getNumOperands(); ++I) {
      Value *IncValue = OldPN->getIncomingValue(I);
      // An undef or zero constant incoming value is materialized as tilezero;
      // any other constant blocks the transform.
      if (isa<Constant>(IncValue)) {
        auto *IncConst = dyn_cast<Constant>(IncValue);
        if (!isa<UndefValue>(IncValue) && !IncConst->isZeroValue())
          return false;
        Value *Row = nullptr, *Col = nullptr;
        std::tie(Row, Col) = getShape(OldPN);
        // The shape must be constant so it dominates the new tilezero.
        if (!Row || !Col || !isa<Constant>(Row) || !isa<Constant>(Col))
          return false;
        // Create tilezero at the end of the incoming block.
        auto *Block = OldPN->getIncomingBlock(I);
        BasicBlock::iterator Iter = Block->getTerminator()->getIterator();
        Instruction *NewInst = Builder.CreateIntrinsic(
            Intrinsic::x86_tilezero_internal, std::nullopt, {Row, Col});
        NewInst->moveBefore(&*Iter);
        NewInst = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
                                          {IncValue->getType()}, {NewInst});
        NewInst->moveBefore(&*Iter);
        // Replace the incoming value with the new value.
        OldPN->setIncomingValue(I, NewInst);
        IncValue = NewInst;
      }

      if (auto *PNode = dyn_cast<PHINode>(IncValue)) {
        if (OldPhiNodes.insert(PNode))
          PhiWorklist.push_back(PNode);
        continue;
      }
      Instruction *ACI = dyn_cast<Instruction>(IncValue);
      if (ACI && isAMXCast(ACI)) {
        // Verify it is an A->B cast.
        Type *TyA = ACI->getOperand(0)->getType();
        Type *TyB = ACI->getType();
        if (TyA != DestTy || TyB != SrcTy)
          return false;
        continue;
      }
      return false;
    }
  }

  // Check that each user of each old PHI node is something that we can
  // rewrite, so that all of the old PHI nodes can be cleaned up afterwards.
  for (auto *OldPN : OldPhiNodes) {
    for (User *V : OldPN->users()) {
      Instruction *ACI = dyn_cast<Instruction>(V);
      if (ACI && isAMXCast(ACI)) {
        // Verify it is a B->A cast.
        Type *TyB = ACI->getOperand(0)->getType();
        Type *TyA = ACI->getType();
        if (TyA != DestTy || TyB != SrcTy)
          return false;
      } else if (auto *PHI = dyn_cast<PHINode>(V)) {
        // A user that is another old PHI node keeps the whole web internal,
        // so it will be dead after the rewrite even if it is not touched here.
        if (OldPhiNodes.count(PHI) == 0)
          return false;
      } else {
        return false;
      }
    }
  }

  // For each old PHI node, create a corresponding new PHI node with type A.
  SmallDenseMap<PHINode *, PHINode *> NewPNodes;
  for (auto *OldPN : OldPhiNodes) {
    Builder.SetInsertPoint(OldPN);
    PHINode *NewPN = Builder.CreatePHI(DestTy, OldPN->getNumOperands());
    NewPNodes[OldPN] = NewPN;
  }

  // Fill in the operands of the new PHI nodes.
  for (auto *OldPN : OldPhiNodes) {
    PHINode *NewPN = NewPNodes[OldPN];
    for (unsigned j = 0, e = OldPN->getNumOperands(); j != e; ++j) {
      Value *V = OldPN->getOperand(j);
      Value *NewV = nullptr;
      Instruction *ACI = dyn_cast<Instruction>(V);
      // An incoming AMX cast contributes its source; an incoming old PHI
      // contributes the corresponding new PHI.
      if (ACI && isAMXCast(ACI))
        NewV = ACI->getOperand(0);
      else if (auto *PrevPN = dyn_cast<PHINode>(V))
        NewV = NewPNodes[PrevPN];
      assert(NewV);
      NewPN->addIncoming(NewV, OldPN->getIncomingBlock(j));
    }
  }

  // Redirect the users of the old PHI nodes: B->A casts are replaced by the
  // new PHIs and queued for deletion; PHI users inside the web are left for
  // ordinary dead-code cleanup.
  for (auto *OldPN : OldPhiNodes) {
    PHINode *NewPN = NewPNodes[OldPN];
    for (User *V : make_early_inc_range(OldPN->users())) {
      Instruction *ACI = dyn_cast<Instruction>(V);
      if (ACI && isAMXCast(ACI)) {
        Type *TyB = ACI->getOperand(0)->getType();
        Type *TyA = ACI->getType();
        assert(TyA == DestTy && TyB == SrcTy);
        (void)TyA;
        (void)TyB;
        ACI->replaceAllUsesWith(NewPN);
        DeadInst.insert(ACI);
      } else if (auto *PHI = dyn_cast<PHINode>(V)) {
        // Operands of rewritten PHI nodes; they can be deleted later.
        assert(OldPhiNodes.contains(PHI));
        (void)PHI;
      } else {
        llvm_unreachable("all uses should be handled");
      }
    }
  }
  return true;
}
// combineCastStore: fold a tile-to-vector cast whose value is stored into a
// direct @llvm.x86.tilestored64.internal call.
bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
  Value *Tile = Cast->getOperand(0);
  if (!isAMXIntrinsic(Tile))
    return false;
  auto *II = cast<IntrinsicInst>(Tile);
  // Tile is the output of an AMX intrinsic: operand 0 is row, operand 1 is col.
  Value *Row = II->getOperand(0);
  Value *Col = II->getOperand(1);
  IRBuilder<> Builder(ST);
  // The stride equals the column count measured in bytes.
  Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
  Value *I8Ptr = Builder.CreateBitCast(ST->getOperand(1), Builder.getPtrTy());
  std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile};
  Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, std::nullopt,
                          Args);
  return true;
}
// combineLoadCast: fold a load feeding a vector-to-tile cast into a direct
// @llvm.x86.tileloadd64.internal call.
bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
  bool EraseLoad = true;
  Value *Row = nullptr, *Col = nullptr;
  Use &U = *(Cast->use_begin());
  unsigned OpNo = U.getOperandNo();
  auto *II = cast<IntrinsicInst>(U.getUser());
  // TODO: if the user is a cast intrinsic or PHI node, the shape could be
  // propagated through the def-use chain instead.
  if (!isAMXIntrinsic(II))
    return false;
  std::tie(Row, Col) = getShape(II, OpNo);
  IRBuilder<> Builder(LD);
  // The stride equals the column count measured in bytes.
  Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
  Value *I8Ptr;

  // Build the dominator tree lazily; it is only needed here.
  if (!DT)
    DT.reset(new DominatorTree(Func));
  if (!DT->dominates(Row, LD) || !DT->dominates(Col, LD)) {
    // The shape does not dominate the load: store the loaded value to a stack
    // slot and reload it right before the cast.
    auto *AllocaAddr =
        createAllocaInstAtEntry(Builder, Cast->getParent(), LD->getType());
    Builder.SetInsertPoint(&*std::next(LD->getIterator()));
    Builder.CreateStore(LD, AllocaAddr);

    Builder.SetInsertPoint(Cast);
    I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getPtrTy());
    EraseLoad = false;
  } else {
    I8Ptr = Builder.CreateBitCast(LD->getOperand(0), Builder.getPtrTy());
  }
  std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};

  Value *NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal,
                                           std::nullopt, Args);
  Cast->replaceAllUsesWith(NewInst);

  return EraseLoad;
}
bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
  bool Change = false;
  for (auto *Cast : Casts) {
    auto *II = cast<IntrinsicInst>(Cast);
    if (II->getIntrinsicID() == Intrinsic::x86_cast_tile_to_vector) {
      // Fold tile-to-vector casts into the stores that consume them.
      SmallVector<Instruction *, 2> DeadStores;
      for (User *U : Cast->users()) {
        StoreInst *Store = dyn_cast<StoreInst>(U);
        if (!Store)
          continue;
        if (combineCastStore(cast<IntrinsicInst>(Cast), Store)) {
          DeadStores.push_back(Store);
          Change = true;
        }
      }
      for (auto *Store : DeadStores)
        Store->eraseFromParent();
    } else { // x86_cast_vector_to_tile
      // Fold a single-use load feeding a vector-to-tile cast into a tile load.
      auto *Load = dyn_cast<LoadInst>(Cast->getOperand(0));
      if (!Load || !Load->hasOneUse())
        continue;
      if (combineLoadCast(cast<IntrinsicInst>(Cast), Load)) {
        // Clear the operand so the load can be erased.
        Cast->setOperand(0, nullptr);
        Load->eraseFromParent();
      }
    }
  }
  return Change;
}
bool X86LowerAMXCast::combineAMXcast(TargetLibraryInfo *TLI) {
  bool Change = false;
  // Collect the tile cast instructions.
  SmallVector<Instruction *, 8> Vec2TileInsts;
  SmallVector<Instruction *, 8> Tile2VecInsts;
  SmallVector<Instruction *, 8> PhiCastWorkList;
  SmallSetVector<Instruction *, 16> DeadInst;
  for (BasicBlock &BB : Func) {
    for (Instruction &I : BB) {
      Value *Vec;
      if (match(&I,
                m_Intrinsic<Intrinsic::x86_cast_vector_to_tile>(m_Value(Vec))))
        Vec2TileInsts.push_back(&I);
      else if (match(&I, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(
                             m_Value(Vec))))
        Tile2VecInsts.push_back(&I);
    }
  }

  // Cancel out back-to-back casts: a user that casts straight back can read
  // the original operand directly.
  auto Convert = [&](SmallVectorImpl<Instruction *> &Insts, Intrinsic::ID IID) {
    for (auto *Inst : Insts) {
      for (User *U : Inst->users()) {
        IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
        if (!II || II->getIntrinsicID() != IID)
          continue;
        II->replaceAllUsesWith(Inst->getOperand(0));
        Change = true;
      }
    }
  };

  Convert(Vec2TileInsts, Intrinsic::x86_cast_tile_to_vector);
  Convert(Tile2VecInsts, Intrinsic::x86_cast_vector_to_tile);

  SmallVector<Instruction *, 8> LiveCasts;
  auto EraseInst = [&](SmallVectorImpl<Instruction *> &Insts) {
    for (auto *Inst : Insts) {
      if (Inst->use_empty()) {
        Inst->eraseFromParent();
        Change = true;
      } else {
        LiveCasts.push_back(Inst);
      }
    }
  };

  EraseInst(Vec2TileInsts);
  EraseInst(Tile2VecInsts);
  LLVM_DEBUG(dbgs() << "[LowerAMXTYpe][combineAMXcast] IR dump after combine "
                       "Vec2Tile and Tile2Vec:\n";
             Func.dump());
  Change |= combineLdSt(LiveCasts);
  EraseInst(LiveCasts);
  LLVM_DEBUG(dbgs() << "[LowerAMXTYpe][combineAMXcast] IR dump after combine "
                       "AMXCast and load/store:\n";
             Func.dump());

  // Handle the A->B->A cast chains that go through an intervening PHI node.
  for (BasicBlock &BB : Func) {
    for (Instruction &I : BB) {
      if (isAMXCast(&I)) {
        if (isa<PHINode>(I.getOperand(0)))
          PhiCastWorkList.push_back(&I);
      }
    }
  }
  for (auto *I : PhiCastWorkList) {
    // Skip AMX casts that are already dead.
    if (DeadInst.contains(I))
      continue;
    PHINode *PN = cast<PHINode>(I->getOperand(0));
    if (optimizeAMXCastFromPhi(cast<IntrinsicInst>(I), PN, DeadInst)) {
      DeadInst.insert(PN);
      Change = true;
    }
  }

  // The rewrites above can leave PHIs and AMX casts without uses; run a small
  // DCE over the collected instructions.
  while (!DeadInst.empty()) {
    Instruction *I = DeadInst.pop_back_val();
    Change |= DCEInstruction(I, DeadInst, TLI);
  }
  LLVM_DEBUG(dbgs() << "[LowerAMXTYpe][combineAMXcast] IR dump after "
                       "optimizeAMXCastFromPhi:\n";
             Func.dump());
  return Change;
}
// Lower any AMX cast that survived combineAMXcast by spilling through a stack
// slot, mirroring transformBitcast above.
bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) {
  IRBuilder<> Builder(AMXCast);
  AllocaInst *AllocaAddr;
  Value *I8Ptr, *Stride;
  auto *Src = AMXCast->getOperand(0);

  auto Prepare = [&](Type *MemTy) {
    AllocaAddr = createAllocaInstAtEntry(Builder, AMXCast->getParent(), MemTy);
    I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getPtrTy());
    Stride = Builder.getInt64(64);
  };

  if (AMXCast->getType()->isX86_AMXTy()) {
    // Vector -> tile: store the vector to the slot and reload it as a tile.
    if (AMXCast->use_empty()) {
      AMXCast->eraseFromParent();
      return true;
    }
    Use &U = *(AMXCast->use_begin());
    unsigned OpNo = U.getOperandNo();
    auto *II = dyn_cast<IntrinsicInst>(U.getUser());
    if (!II)
      return false; // May be a cast from x86_amx to <256 x i32>.
    Prepare(AMXCast->getOperand(0)->getType());
    Builder.CreateStore(Src, AllocaAddr);
    Value *Row = nullptr, *Col = nullptr;
    std::tie(Row, Col) = getShape(II, OpNo);
    std::array<Value *, 4> Args = {
        Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())};
    Value *NewInst = Builder.CreateIntrinsic(
        Intrinsic::x86_tileloadd64_internal, std::nullopt, Args);
    AMXCast->replaceAllUsesWith(NewInst);
    AMXCast->eraseFromParent();
  } else {
    // Tile -> vector: store the tile to the slot and reload it as a vector.
    auto *II = dyn_cast<IntrinsicInst>(Src);
    if (!II)
      return false; // May be a cast from <256 x i32> to x86_amx.
    Prepare(AMXCast->getType());
    Value *Row = II->getOperand(0);
    Value *Col = II->getOperand(1);
    std::array<Value *, 5> Args = {
        Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty()), Src};
    Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, std::nullopt,
                            Args);
    Value *NewInst = Builder.CreateLoad(AMXCast->getType(), AllocaAddr);
    AMXCast->replaceAllUsesWith(NewInst);
    AMXCast->eraseFromParent();
  }

  return true;
}
bool X86LowerAMXCast::transformAllAMXCast() {
  bool Change = false;
  // Collect the remaining AMX cast instructions and lower each one through a
  // stack slot.
  SmallVector<Instruction *, 8> WorkLists;
  for (BasicBlock &BB : Func)
    for (Instruction &I : BB)
      if (isAMXCast(&I))
        WorkLists.push_back(&I);

  for (auto *Inst : WorkLists)
    Change |= transformAMXCast(cast<IntrinsicInst>(Inst));

  return Change;
}
// From X86LowerAMXTypeLegacyPass:
  bool runOnFunction(Function &F) override {
    // Most code doesn't use AMX; bail out early if no instruction produces an
    // AMX-typed value.
    if (!containsAMXCode(F))
      return false;

    bool C = false;
    TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);

    X86LowerAMXCast LAC(F);
    C |= LAC.combineAMXcast(TLI);
    // There might be remaining AMX casts after combineAMXcast; lower them too.
    C |= LAC.transformAllAMXCast();

    X86LowerAMXType LAT(F);
    C |= LAT.visit();

    // Prepare for fast register allocation at O0 by making the AMX data
    // "volatile": every tile definition is spilled to a stack slot and every
    // use reloads it.
    if (TM->getOptLevel() == CodeGenOptLevel::None) {
      if (!F.hasFnAttribute(Attribute::OptimizeNone)) {
        X86VolatileTileData VTD(F);
        C = VTD.volatileTileData() || C;
      }
    }

    return C;
  }
static const char PassName[] = "Lower AMX type for load/store";
char X86LowerAMXTypeLegacyPass::ID = 0;

FunctionPass *llvm::createX86LowerAMXTypePass() {
  return new X86LowerAMXTypeLegacyPass();
}
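// For context: the factory above is how the X86 backend schedules this pass in
// its IR pipeline. Roughly (paraphrased from X86PassConfig::addIRPasses, not
// quoted verbatim):
//
//   void X86PassConfig::addIRPasses() {
//     ...
//     addPass(createX86LowerAMXTypePass());
//     TargetPassConfig::addIRPasses();
//     ...
//   }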