33#include "llvm/IR/IntrinsicsX86.h"
52#define DEBUG_TYPE "x86-lower-amx-intrinsics"
57 return FVT->getNumElements() == 256 &&
58 FVT->getElementType()->isIntegerTy(32);
65 cl::desc(
"X86: enable AMX scalarizition."));
68class X86LowerAMXIntrinsics {
73 : Func(
F), DTU(DomTU), LI(LoopI) {}
79 BasicBlock *createLoop(BasicBlock *Preheader, BasicBlock *Exit,
Value *Bound,
80 ConstantInt *Step, StringRef Name, IRBuilderBase &
B,
82 template <
bool IsTileLoad>
83 Value *createTileLoadStoreLoops(BasicBlock *Start, BasicBlock *End,
86 template <Intrinsic::ID IntrID>
87 std::enable_if_t<IntrID == Intrinsic::x86_tdpbssd_internal ||
88 IntrID == Intrinsic::x86_tdpbsud_internal ||
89 IntrID == Intrinsic::x86_tdpbusd_internal ||
90 IntrID == Intrinsic::x86_tdpbuud_internal ||
91 IntrID == Intrinsic::x86_tdpbf16ps_internal,
93 createTileDPLoops(BasicBlock *Start, BasicBlock *End, IRBuilderBase &
B,
96 template <
bool IsTileLoad>
97 bool lowerTileLoadStore(Instruction *TileLoadStore);
98 template <Intrinsic::ID IntrID>
99 std::enable_if_t<IntrID == Intrinsic::x86_tdpbssd_internal ||
100 IntrID == Intrinsic::x86_tdpbsud_internal ||
101 IntrID == Intrinsic::x86_tdpbusd_internal ||
102 IntrID == Intrinsic::x86_tdpbuud_internal ||
103 IntrID == Intrinsic::x86_tdpbf16ps_internal,
105 lowerTileDP(Instruction *TileDP);
106 bool lowerTileZero(Instruction *TileZero);
122 Type *I16Ty = Type::getInt16Ty(Ctx);
126 PHINode::Create(I16Ty, 2, Name +
".iv", Header->getTerminator()->getIterator());
127 IV->addIncoming(ConstantInt::get(I16Ty, 0), Preheader);
129 B.SetInsertPoint(Latch);
130 Value *Inc =
B.CreateAdd(
IV, Step, Name +
".step");
131 Value *
Cond =
B.CreateICmpNE(Inc, Bound, Name +
".cond");
136 "Expected a non-zero step size. This is chosen by the pass and "
137 "should always be non-zero to imply a finite loop.");
140 *BR, {BoundInt->getZExtValue() / Step->
getZExtValue(), 1},
false);
145 IV->addIncoming(Inc, Latch);
151 {DominatorTree::Delete, Preheader, Tmp},
152 {DominatorTree::Insert, Header, Body},
153 {DominatorTree::Insert, Body, Latch},
154 {DominatorTree::Insert, Latch, Header},
155 {DominatorTree::Insert, Latch,
Exit},
156 {DominatorTree::Insert, Preheader, Header},
159 L->addBasicBlockToLoop(Header, *LI);
160 L->addBasicBlockToLoop(Body, *LI);
161 L->addBasicBlockToLoop(Latch, *LI);
166template <
bool IsTileLoad>
167Value *X86LowerAMXIntrinsics::createTileLoadStoreLoops(
168 BasicBlock *Start, BasicBlock *End, IRBuilderBase &
B,
Value *Row,
170 std::string IntrinName = IsTileLoad ?
"tileload" :
"tilestore";
171 Loop *RowLoop =
nullptr;
172 Loop *ColLoop =
nullptr;
178 ParentL->addChildLoop(RowLoop);
183 BasicBlock *RowBody = createLoop(Start, End, Row,
B.getInt16(1),
184 IntrinName +
".scalarize.rows",
B, RowLoop);
187 BasicBlock *ColBody = createLoop(RowBody, RowLatch, Col,
B.getInt16(1),
188 IntrinName +
".scalarize.cols",
B, ColLoop);
195 Type *EltTy =
B.getInt32Ty();
202 Value *CurrentRowZExt =
B.CreateZExt(CurrentRow, Stride->
getType());
203 Value *CurrentColZExt =
B.CreateZExt(CurrentCol, Stride->
getType());
205 B.CreateAdd(
B.CreateMul(CurrentRowZExt, Stride), CurrentColZExt);
207 Value *Idx =
B.CreateAdd(
B.CreateMul(CurrentRow,
B.getInt16(16)), CurrentCol);
214 PHINode *VecCPhiRowLoop =
B.CreatePHI(V256I32Ty, 2,
"vec.phi.row");
221 PHINode *VecPhi =
B.CreatePHI(V256I32Ty, 2,
"vec.phi");
230 Value *Elt =
B.CreateLoad(EltTy, EltPtr);
231 Value *ResVec =
B.CreateInsertElement(VecPhi, Elt, Idx);
238 Value *Vec = BitCast->getOperand(0);
246 Value *Elt =
B.CreateExtractElement(Vec, Idx);
248 B.CreateStore(Elt, EltPtr);
253template <Intrinsic::ID IntrID>
254std::enable_if_t<IntrID == Intrinsic::x86_tdpbssd_internal ||
255 IntrID == Intrinsic::x86_tdpbsud_internal ||
256 IntrID == Intrinsic::x86_tdpbusd_internal ||
257 IntrID == Intrinsic::x86_tdpbuud_internal ||
258 IntrID == Intrinsic::x86_tdpbf16ps_internal,
260X86LowerAMXIntrinsics::createTileDPLoops(BasicBlock *Start, BasicBlock *End,
261 IRBuilderBase &
B,
Value *Row,
264 std::string IntrinName;
266 case Intrinsic::x86_tdpbssd_internal:
267 IntrinName =
"tiledpbssd";
269 case Intrinsic::x86_tdpbsud_internal:
270 IntrinName =
"tiledpbsud";
272 case Intrinsic::x86_tdpbusd_internal:
273 IntrinName =
"tiledpbusd";
275 case Intrinsic::x86_tdpbuud_internal:
276 IntrinName =
"tiledpbuud";
278 case Intrinsic::x86_tdpbf16ps_internal:
279 IntrinName =
"tiledpbf16ps";
282 Loop *RowLoop =
nullptr;
283 Loop *ColLoop =
nullptr;
284 Loop *InnerLoop =
nullptr;
292 ParentL->addChildLoop(RowLoop);
297 BasicBlock *RowBody = createLoop(Start, End, Row,
B.getInt16(1),
298 IntrinName +
".scalarize.rows",
B, RowLoop);
301 BasicBlock *ColBody = createLoop(RowBody, RowLatch, Col,
B.getInt16(1),
302 IntrinName +
".scalarize.cols",
B, ColLoop);
308 createLoop(ColBody, ColLoopLatch, K,
B.getInt16(1),
309 IntrinName +
".scalarize.inner",
B, InnerLoop);
317 Value *CurrentInner = &*InnerLoopHeader->
begin();
321 Value *VecC = BitCastAcc->getOperand(0);
327 Value *VecA = BitCastLHS->getOperand(0);
330 Value *VecB = BitCastRHS->getOperand(0);
340 PHINode *VecCPhiRowLoop =
B.CreatePHI(V256I32Ty, 2,
"vec.c.phi.row");
343 PHINode *VecDPhiRowLoop =
B.CreatePHI(V256I32Ty, 2,
"vec.d.phi.row");
357 PHINode *VecCPhiColLoop =
B.CreatePHI(V256I32Ty, 2,
"vec.c.phi.col");
358 VecCPhiColLoop->
addIncoming(VecCPhiRowLoop, RowBody);
359 PHINode *VecDPhiColLoop =
B.CreatePHI(V256I32Ty, 2,
"vec.d.phi.col");
360 VecDPhiColLoop->
addIncoming(VecDPhiRowLoop, RowBody);
362 B.CreateAdd(
B.CreateMul(CurrentRow,
B.getInt16(16)), CurrentCol);
370 PHINode *VecCPhi =
B.CreatePHI(V256I32Ty, 2,
"vec.c.inner.phi");
375 B.CreateAdd(
B.CreateMul(CurrentRow,
B.getInt16(16)), CurrentInner);
377 B.CreateAdd(
B.CreateMul(CurrentInner,
B.getInt16(16)), CurrentCol);
378 Value *NewVecC =
nullptr;
380 if (IntrID != Intrinsic::x86_tdpbf16ps_internal) {
397 Value *EltC =
B.CreateExtractElement(VecCPhi, IdxC);
398 Value *EltA =
B.CreateExtractElement(VecA, IdxA);
399 Value *SubVecA =
B.CreateBitCast(EltA, V4I8Ty);
400 Value *EltB =
B.CreateExtractElement(VecB, IdxB);
401 Value *SubVecB =
B.CreateBitCast(EltB, V4I8Ty);
402 Value *SEXTSubVecB =
nullptr;
403 Value *SEXTSubVecA =
nullptr;
405 case Intrinsic::x86_tdpbssd_internal:
406 SEXTSubVecB =
B.CreateSExt(SubVecB, V4I32Ty);
407 SEXTSubVecA =
B.CreateSExt(SubVecA, V4I32Ty);
409 case Intrinsic::x86_tdpbsud_internal:
410 SEXTSubVecB =
B.CreateZExt(SubVecB, V4I32Ty);
411 SEXTSubVecA =
B.CreateSExt(SubVecA, V4I32Ty);
413 case Intrinsic::x86_tdpbusd_internal:
414 SEXTSubVecB =
B.CreateSExt(SubVecB, V4I32Ty);
415 SEXTSubVecA =
B.CreateZExt(SubVecA, V4I32Ty);
417 case Intrinsic::x86_tdpbuud_internal:
418 SEXTSubVecB =
B.CreateZExt(SubVecB, V4I32Ty);
419 SEXTSubVecA =
B.CreateZExt(SubVecA, V4I32Ty);
424 Value *SubVecR =
B.CreateAddReduce(
B.CreateMul(SEXTSubVecA, SEXTSubVecB));
425 Value *ResElt =
B.CreateAdd(EltC, SubVecR);
426 NewVecC =
B.CreateInsertElement(VecCPhi, ResElt, IdxC);
452 Value *EltC =
B.CreateExtractElement(VecCPhi, IdxC);
453 Value *EltCF32 =
B.CreateBitCast(EltC,
B.getFloatTy());
454 Value *EltA =
B.CreateExtractElement(VecA, IdxA);
455 Value *SubVecA =
B.CreateBitCast(EltA, V2I16Ty);
456 Value *EltB =
B.CreateExtractElement(VecB, IdxB);
457 Value *SubVecB =
B.CreateBitCast(EltB, V2I16Ty);
459 int ShuffleMask[4] = {2, 0, 3, 1};
460 auto ShuffleArray =
ArrayRef(ShuffleMask);
461 Value *AV2F32 =
B.CreateBitCast(
462 B.CreateShuffleVector(SubVecA, ZeroV2I16, ShuffleArray), V2F32Ty);
463 Value *BV2F32 =
B.CreateBitCast(
464 B.CreateShuffleVector(SubVecB, ZeroV2I16, ShuffleArray), V2F32Ty);
465 Value *SubVecR =
B.CreateFAddReduce(EltCF32,
B.CreateFMul(AV2F32, BV2F32));
466 Value *ResElt =
B.CreateBitCast(SubVecR,
B.getInt32Ty());
467 NewVecC =
B.CreateInsertElement(VecCPhi, ResElt, IdxC);
475 Value *NewEltC =
B.CreateExtractElement(NewVecC, IdxC);
476 Value *NewVecD =
B.CreateInsertElement(VecDPhiColLoop, NewEltC, IdxC);
480 VecCPhiColLoop->
addIncoming(NewVecC, ColLoopLatch);
482 VecDPhiColLoop->
addIncoming(NewVecD, ColLoopLatch);
487template <Intrinsic::ID IntrID>
488std::enable_if_t<IntrID == Intrinsic::x86_tdpbssd_internal ||
489 IntrID == Intrinsic::x86_tdpbsud_internal ||
490 IntrID == Intrinsic::x86_tdpbusd_internal ||
491 IntrID == Intrinsic::x86_tdpbuud_internal ||
492 IntrID == Intrinsic::x86_tdpbf16ps_internal,
494X86LowerAMXIntrinsics::lowerTileDP(Instruction *TileDP) {
500 PreBuilder.SetInsertPoint(TileDP);
504 Value *NDWord = PreBuilder.CreateLShr(
N, PreBuilder.getInt16(2));
505 Value *KDWord = PreBuilder.CreateLShr(K, PreBuilder.getInt16(2));
510 Value *ResVec = createTileDPLoops<IntrID>(Start, End, Builder, M, NDWord,
516 Builder.CreateBitCast(ResVec, Type::getX86_AMXTy(Builder.getContext()));
522 I->replaceAllUsesWith(ResVec);
523 I->eraseFromParent();
531template <
bool IsTileLoad>
532bool X86LowerAMXIntrinsics::lowerTileLoadStore(Instruction *TileLoadStore) {
533 Value *
M, *
N, *Ptr, *Stride, *Tile;
545 PreBuilder.SetInsertPoint(TileLoadStore);
546 Value *NDWord = PreBuilder.CreateLShr(
N, PreBuilder.getInt16(2));
547 Value *StrideDWord = PreBuilder.CreateLShr(Stride, PreBuilder.getInt64(2));
552 Value *ResVec = createTileLoadStoreLoops<IsTileLoad>(
553 Start, End, Builder, M, NDWord, Ptr, StrideDWord,
554 IsTileLoad ?
nullptr : Tile);
560 Builder.CreateBitCast(ResVec, Type::getX86_AMXTy(Builder.getContext()));
566 I->replaceAllUsesWith(ResVec);
567 I->eraseFromParent();
576bool X86LowerAMXIntrinsics::lowerTileZero(Instruction *TileZero) {
584 I->replaceAllUsesWith(VecZero);
585 I->eraseFromParent();
592bool X86LowerAMXIntrinsics::visit() {
598 switch (Inst->getIntrinsicID()) {
599 case Intrinsic::x86_tdpbssd_internal:
600 case Intrinsic::x86_tdpbsud_internal:
601 case Intrinsic::x86_tdpbusd_internal:
602 case Intrinsic::x86_tdpbuud_internal:
603 case Intrinsic::x86_tileloadd64_internal:
604 case Intrinsic::x86_tilestored64_internal:
605 case Intrinsic::x86_tilezero_internal:
606 case Intrinsic::x86_tdpbf16ps_internal:
616 for (
auto *Inst : WorkList) {
617 switch (Inst->getIntrinsicID()) {
618 case Intrinsic::x86_tdpbssd_internal:
619 C = lowerTileDP<Intrinsic::x86_tdpbssd_internal>(Inst) ||
C;
621 case Intrinsic::x86_tdpbsud_internal:
622 C = lowerTileDP<Intrinsic::x86_tdpbsud_internal>(Inst) ||
C;
624 case Intrinsic::x86_tdpbusd_internal:
625 C = lowerTileDP<Intrinsic::x86_tdpbusd_internal>(Inst) ||
C;
627 case Intrinsic::x86_tdpbuud_internal:
628 C = lowerTileDP<Intrinsic::x86_tdpbuud_internal>(Inst) ||
C;
630 case Intrinsic::x86_tdpbf16ps_internal:
631 C = lowerTileDP<Intrinsic::x86_tdpbf16ps_internal>(Inst) ||
C;
633 case Intrinsic::x86_tileloadd64_internal:
634 C = lowerTileLoadStore<true>(Inst) ||
C;
636 case Intrinsic::x86_tilestored64_internal:
637 C = lowerTileLoadStore<false>(Inst) ||
C;
639 case Intrinsic::x86_tilezero_internal:
640 C = lowerTileZero(Inst) ||
C;
651bool shouldRunLowerAMXIntrinsics(
const Function &
F,
const TargetMachine *TM) {
656bool runLowerAMXIntrinsics(Function &
F, DominatorTree *DT, LoopInfo *LI) {
657 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
659 X86LowerAMXIntrinsics LAT(
F, DTU, LI);
666 if (!shouldRunLowerAMXIntrinsics(
F, TM))
671 bool Changed = runLowerAMXIntrinsics(
F, &DT, &LI);
682class X86LowerAMXIntrinsicsLegacyPass :
public FunctionPass {
690 if (!shouldRunLowerAMXIntrinsics(
F, TM))
693 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
694 auto *DT = DTWP ? &DTWP->getDomTree() :
nullptr;
695 auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
696 auto *LI = LIWP ? &LIWP->getLoopInfo() :
nullptr;
697 return runLowerAMXIntrinsics(
F, DT, LI);
699 StringRef getPassName()
const override {
return "Lower AMX intrinsics"; }
701 void getAnalysisUsage(AnalysisUsage &AU)
const override {
709static const char PassName[] =
"Lower AMX intrinsics";
710char X86LowerAMXIntrinsicsLegacyPass::ID = 0;
718 return new X86LowerAMXIntrinsicsLegacyPass();
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
This header defines various interfaces for pass management in LLVM.
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
Target-Independent Code Generator Pass Configuration Options pass.
static cl::opt< bool > X86ScalarizeAMX("enable-x86-scalar-amx", cl::init(false), cl::Hidden, cl::desc("X86: enable AMX scalarizition."))
static bool isV256I32Ty(Type *Ty)
static const char PassName[]
static const uint32_t IV[8]
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for the type of this constant.
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
void applyUpdatesPermissive(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
Common base class shared among various IRBuilders.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Analysis pass that exposes the LoopInfo for a function.
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addTopLevelLoop(LoopT *New)
This adds the specified loop to the collection of top-level loops.
LoopT * AllocateLoop(ArgsTy &&...Args)
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will have (use 0 if you really have no idea).
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserve()
Mark an analysis as preserved.
void push_back(const T &Elt)
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Target-Independent Code Generator Pass Configuration Options.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< use_iterator > uses()
PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM)
const ParentTy * getParent() const
Pass manager infrastructure for declaring and invalidating analyses.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BR
Control flow instructions. These all have token chains.
@ BasicBlock
Various leaf nodes.
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
initializer< Ty > init(const Ty &Val)
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
FunctionPass * createX86LowerAMXIntrinsicsLegacyPass()
cl::opt< bool > ProfcheckDisableMetadataFixes
LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, StringRef PassName, const Function *F=nullptr)
Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch weights in the new instruction if the enclosing function is profiled.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
iterator_range< df_iterator< T > > depth_first(const T &G)
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void setFittedBranchWeights(Instruction &I, ArrayRef< uint64_t > Weights, bool IsExpected, bool ElideAllZero=false)
Variant of setBranchWeights where the Weights will be fit first to uint32_t by shifting right.