#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"
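// Overview: this pass rewrites atomics whose pointer operand is wave-uniform so
// that only one lane per wavefront touches memory. The per-lane values are
// combined with a wave-wide reduction (or a scan, when each lane needs its own
// intermediate result), a single lane issues the atomic, and the returned
// memory value is broadcast back and adjusted per lane. Roughly, for a global
// integer add (illustrative sketch only, not the exact emitted IR):
//
//   ; before: every active lane performs its own atomic
//   %old = atomicrmw add ptr addrspace(1) %p, i32 %v
//
//   ; after: one atomic per wavefront
//   %sum  = <reduction of %v over all active lanes>
//   %old0 = atomicrmw add ptr addrspace(1) %p, i32 %sum   ; first active lane only
//   %old  = <broadcast of %old0> + <exclusive prefix sum of %v below this lane>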
struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}
char AMDGPUAtomicOptimizer::ID = 0;
bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);

  bool IsChanged =
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);
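// Both pass-manager entry points gather the same inputs -- uniformity info, the
// data layout, a lazy DomTreeUpdater, the GCN subtarget, whether the function
// is an AMDGPU_PS pixel shader, and the selected scan strategy -- and then
// delegate all of the rewriting to the shared AMDGPUAtomicOptimizerImpl::run().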
bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }
void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {

  switch (I.getPointerAddressSpace()) {

  // Only 32-bit and 64-bit floating-point atomics are handled.
  if (AtomicRMWInst::isFPOperation(Op) &&
      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {
    return;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, each lane may be addressing different
  // memory, so there is nothing to combine.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
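  // Candidates are only recorded here; the actual rewriting happens later in
  // run(), since optimizeAtomic() splits blocks and inserts new control flow,
  // which would invalidate the InstVisitor traversal if done in place.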
void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // Every other operand of the buffer atomic (resource descriptor, offsets,
  // etc.) must be uniform, otherwise the lanes are not all updating the same
  // location and the optimization does not apply.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
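  // Mapping the buffer-atomic intrinsics onto AtomicRMWInst::BinOp values lets
  // the rest of the pass treat raw/struct buffer atomics and plain atomicrmw
  // instructions uniformly: for these intrinsics the combined value is operand
  // 0, while for atomicrmw it is operand 1 (see visitAtomicRMWInst above).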
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
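// buildNonAtomicBinOp maps each atomic opcode onto the ordinary IR operation
// with the same semantics, so that partial results produced during the wave
// reduction/scan can be combined with plain instructions. The integer min/max
// variants (not shown in this listing) are lowered to an icmp plus select with
// the matching predicate, and fadd/fsub use the corresponding FP instructions.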
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();

  // Reduce within each row of 16 lanes with a butterfly of DPP row_xmask
  // permutes (xmask 1, 2, 4, 8).
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Combine the two rows of each 32-lane half with permlanex16.
  Value *Permlanex16Call = B.CreateIntrinsic(
      V->getType(), Intrinsic::amdgcn_permlanex16,
      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
  if (ST->isWave32()) {
    return V;
  }

  if (ST->hasPermLane64()) {
    // Combine the lower and upper 32 lanes with permlane64.
    Value *Permlane64Call =
        B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
  }

  // Otherwise read one lane from each half and combine them scalarly.
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}
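// The reduction above leaves every lane holding the full wave result: the
// row_xmask butterfly combines lanes within each row of 16, permlanex16 folds
// the two rows of each 32-lane half together, and on wave64 either permlane64
// or a pair of readlanes folds the two halves. Because it never produces
// per-lane prefix values, this path is only taken when the atomic's result is
// unused.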
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();

  // Inclusive scan within each row of 16 lanes using DPP row_shr shifts of
  // 1, 2, 4 and 8.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  if (ST->hasDPPBroadcasts()) {
    // GFX9 DPP row broadcasts propagate each row's running total into the rows
    // above it (row mask 0xa for bcast15, 0xc for bcast31).
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // Without DPP broadcasts, permlanex16 and readlane forward the row totals;
    // the row masks restrict the update to the upper rows.
    Value *PermX = B.CreateIntrinsic(
        V->getType(), Intrinsic::amdgcn_permlanex16,
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall = B.CreateCall(
        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

    if (!ST->isWave32()) {
      Value *const Lane31 = B.CreateIntrinsic(
          V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});
      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
    }
  }
  return V;
}
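// buildScan produces an inclusive prefix (scan) of the operation across the
// wavefront: after the row_shr steps each lane holds the combination of itself
// and the lanes below it within its row, and the broadcast/permlane steps then
// fold in the totals of the lower rows. Lane N therefore ends up with
// op(V0, ..., VN), and the last lane holds the full reduction.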
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront-wide shifts.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    // On targets where DPP shifts stay within a row, shift each row and then
    // patch the first lane of every row with the last lane of the row below it
    // via readlane/writelane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32, and lane 47 to lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}
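// Shifting the inclusive scan right by one lane (filling lane 0 with the
// identity) converts it into an exclusive scan: each lane then holds the
// combination of strictly lower lanes, which is exactly the per-lane offset
// added to the value returned by the single atomic.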
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions for the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // PHI nodes for the accumulator, the per-lane scan results, and the bitmask
  // of lanes still to be processed.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

  // Read that lane's contribution to the atomic operation.
  Value *LaneValue = B.CreateIntrinsic(
      V->getType(), Intrinsic::amdgcn_readlane, {V, LaneIdxInt});

  // If the intermediate (exclusive) scan results are needed later, write the
  // current accumulator into that lane's slot before accumulating.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the result.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the processed lane's bit so cttz finds the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop once all active lanes have been processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}
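// Worked example of the iterative scan (sketch): with active lanes {0, 2, 5}
// and an add, the ballot is 0b100101. Iteration 1 processes lane 0 (cttz = 0),
// records the old accumulator (the identity) for lane 0 and adds V0; iteration
// 2 processes lane 2, records V0 as its exclusive prefix and adds V2; iteration
// 3 processes lane 5, records V0+V2 and adds V5. The loop then exits with the
// exclusive scan spread across the lanes and the total V0+V2+V5 as the value
// the single atomic will use.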
void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // In a pixel shader, helper lanes (present only for derivative computation)
  // must not take part in the cross-lane communication, so the whole sequence
  // is wrapped in an amdgcn.ps.live check.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;
  if (IsPixelShader) {
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }
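  // SplitBlockAndInsertIfThen leaves I in the merge block; moving it before the
  // new terminator places the whole optimized sequence inside the "live lanes
  // only" region, while PixelExitBB later receives a PHI so the result is well
  // defined when helper lanes skip the atomic entirely.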
  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  const bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);

  // This is the value each lane would have contributed to the atomic operation.
  Value *V = I.getOperand(ValIdx);

  // Ballot of the currently active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  Value *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Count how many active lanes have a lower lane id than ours, using mbcnt.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
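  // mbcnt_lo/mbcnt_hi count the set bits of the exec ballot that correspond to
  // lanes below the current one, so Mbcnt is this lane's position among the
  // active lanes. For example, with ballot 0b1011 lane 3 gets Mbcnt = 2, and
  // exactly one active lane (the lowest) gets Mbcnt = 0.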
  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // First set all inactive lanes to the identity value so that they do not
      // perturb the result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST->hasPermLaneX16()) {
        // A plain reduction is enough when the atomic's result is unused.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // The last lane of the inclusive scan holds the full reduction; that
        // is the value the single atomic will use.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
      }
      // Keep the cross-lane sequence inside a whole-wave (WWM) section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // The iterative variant builds its own loop in the ComputeLoop and
      // ComputeEnd blocks.
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    }
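  // Two scan strategies exist: ScanOptions::DPP performs the reduction/scan
  // with DPP and permlane permutes inside a strict WWM region, while
  // ScanOptions::Iterative emits an explicit loop (ComputeLoop/ComputeEnd) that
  // peels off one active lane per iteration. In both cases NewV ends up holding
  // the wave-combined value and ExclScan the per-lane exclusive prefix when the
  // atomic's result is still used.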
  } else {
    switch (Op) {
    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The single atomic adds the value multiplied by the number of active
      // lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::Xor: {
      // For xor only the parity of the active-lane count matters.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
    }
  }

  // Only the lane with no active lanes below it (Mbcnt == 0) will perform the
  // atomic; all others skip it.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
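  // Because exactly one active lane sees Mbcnt == 0, the block split guarded by
  // Cond creates a region that a single lane enters; that lane executes the
  // cloned atomic with the wave-combined value NewV while every other lane
  // waits at the join block.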
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the split-off terminator into ComputeEnd and branch the original
    // block into the iterative scan loop instead.
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    B.SetInsertPoint(OriginalBB);
    B.CreateBr(ComputeLoop);

    DTU.applyUpdates(DomTreeUpdates);
    Predecessor = ComputeEnd;
  } else {
    Predecessor = OriginalBB;
  }

  // Clone the atomic into the single-lane block with the combined value.
  B.SetInsertPoint(SingleLaneTerminator);
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  B.SetInsertPoint(&I);

    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
    // Broadcast the value returned to the first active lane to every lane.
    Value *BroadcastI = nullptr;
    BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);

    // Each lane's share of the result is the broadcast old value combined with
    // its own lane offset.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      }
    } else {
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
      case AtomicRMWInst::FMin:
      case AtomicRMWInst::FMax:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub:
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
    }
    Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (isAtomicFloatingPointTy) {

    if (IsPixelShader) {
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      I.replaceAllUsesWith(Result);
    }
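    // Every lane now observes the value it would have seen had all the original
    // atomics executed: the broadcast pre-atomic memory value combined with its
    // own exclusive prefix. In the pixel-shader case the PHI created in
    // PixelExitBB reconverges with the helper lanes, which skipped the whole
    // sequence and contribute a poison value.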
991 "AMDGPU atomic optimizations",
false,
false)
998 return new AMDGPUAtomicOptimizer(ScanStrategy);