#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"
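// This pass optimizes atomic operations by using a single lane of a wavefront
// to perform the atomic operation, thus reducing contention on that memory
// location.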
struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};
  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;
bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }
  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  const bool Changed = !ToReplace.empty();
  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }
void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
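  // Early exit for unhandled address spaces: only global and local (LDS)
  // atomics are optimized.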
  switch (I.getPointerAddressSpace()) {
  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;
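  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.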
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));
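  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. Divergent values can only be optimized
  // if DPP is available and the operation is 32 bits wide.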
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }
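  // The atomic can be optimized into a single wavefront-wide operation, so
  // remember the instruction and come back to it later.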
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}
void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
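  // Map the raw, struct and legacy buffer atomic intrinsics onto the
  // corresponding AtomicRMW binary operation.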
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }
  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));
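  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. Divergent values can only be optimized
  // if DPP is available and the operation is 32 bits wide.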
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }
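  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.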
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}
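// buildNonAtomicBinOp emits the plain (non-atomic) IR form of an atomic
// binary operation; it is used below while building the scans and reductions.
// The min/max operations are built from a compare plus a select.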
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);
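// Build a whole-wavefront reduction of V with the given operation. DPP
// row_xmask swaps reduce within each row of 16 lanes, permlanex16 combines
// the rows, and for wave64 two readlanes combine the two 32-lane halves.
// Inactive lanes are expected to already hold the identity value.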
Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *const Identity) const {
  Module *M = B.GetInsertBlock()->getModule();
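  // Butterfly-reduce within each row of 16 lanes using DPP row_xmask swaps of
  // 1, 2, 4 and 8 lanes.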
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
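  // Combine the two rows of each 32-lane half using permlanex16.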
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(Intrinsic::amdgcn_permlanex16, {},
                        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));
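  // For wave64, read one lane from each 32-lane half and combine them with a
  // final scalar operation (a wave32 reduction is already complete here).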
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}
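// Build a whole-wavefront inclusive scan of V with the given operation, using
// DPP row shifts within each row and either row broadcasts (GFX9) or
// permlane/readlane (GFX10) for the cross-row combines.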
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                        Value *V, Value *const Identity) const {
  Module *M = B.GetInsertBlock()->getModule();
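  // Perform the inclusive scan within each row of 16 lanes using DPP row
  // shifts of 1, 2, 4 and 8 lanes.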
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
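    // GFX9 has DPP row broadcast operations for the cross-row combines.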
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
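    // On GFX10 all DPP operations are confined to a single row, so the
    // cross-row combines use permlanex16 and readlane instead: combine lane
    // 15 into lanes 16..31 (and, for wave64, lane 47 into lanes 48..63).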
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
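      // Combine lane 31 into lanes 32..63.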
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
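// Shift each lane's value one lane to the right across the wavefront, filling
// lane 0 with the identity value; this turns the inclusive scan above into an
// exclusive scan.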
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Module *M = B.GetInsertBlock()->getModule();

  if (ST->hasDPPWavefrontShifts()) {
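    // GFX9 has a DPP wavefront shift, which can shift the whole wave right by
    // one lane directly.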
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
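    // On GFX10 DPP shifts stay within a row, so shift each row right by one
    // and then patch the row boundaries with readlane/writelane copies.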
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});
    if (!ST->isWave32()) {
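      // Copy the old lane 31 to the new lane 32.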
      V = B.CreateCall(WriteLane,
                       {B.CreateCall(ReadLane, {Old, B.getInt32(31)}),
                        B.getInt32(32), V});
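      // Copy the old lane 47 to the new lane 48.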
      V = B.CreateCall(WriteLane,
                       {B.CreateCall(ReadLane, {Old, B.getInt32(47)}),
                        B.getInt32(48), V});
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
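  // If we are optimizing an atomic inside a pixel shader, the whole operation
  // is wrapped in a check of amdgcn.ps.live so that helper lanes do not take
  // part in the atomic.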
  if (IsPixelShader) {
    PixelEntryBB = I.getParent();
    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});

    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }
  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);

  Value *const V = I.getOperand(ValIdx);
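  // We need to know which lanes are currently active, which we get by
  // ballotting a true value across the wavefront.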
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
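  // We need to know how many active lanes are below us in the wavefront; the
  // mbcnt intrinsics count the set ballot bits belonging to lower lanes.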
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();
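  // For a divergent value we build a wavefront-wide scan or reduction. First
  // set all inactive lanes to the identity value so that they contribute
  // nothing to the combined result.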
  if (ValDivergent) {
    NewV =
        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
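    // Subtractions are scanned as additions of the same values; the original
    // atomic then subtracts the accumulated total. If the result of the
    // atomic is unused and permlanex16 is available, a plain reduction is
    // cheaper than a full scan.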
    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (!NeedResult && ST->hasPermLaneX16()) {
      NewV = buildReduction(B, ScanOp, NewV, Identity);
    } else {
      NewV = buildScan(B, ScanOp, NewV, Identity);
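      // Each lane's offset into the result is the exclusive scan, i.e. the
      // inclusive scan shifted right by one lane.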
      if (NeedResult)
        ExclScan = buildShiftRight(B, NewV, Identity);
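      // Read the value from the last lane, which has accumulated the values
      // of each active lane in the wavefront. This is the value we provide to
      // the single atomic operation.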
      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
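    // Finally mark the readlanes in the WWM section.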
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
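      // Uniform value, add/sub case: the value contributed to the atomic is
      // the old value times the number of active lanes.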
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
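      // Uniform value, xor case: the contribution is the old value times the
      // parity of the number of active lanes.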
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
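  // Only a single lane should perform the atomic operation: the first active
  // lane, i.e. the one whose count of active lanes below it is zero.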
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
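  // Split the block so that only that single lane executes a clone of the
  // atomic instruction, then move the builder into the new block.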
  B.SetInsertPoint(SingleLaneTerminator);
  B.SetInsertPoint(&I);
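    // Create a PHI node to carry the new atomic result into the exit block.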
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
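    // Each lane's individual result is the broadcast atomic result combined
    // with its own lane offset: the exclusive scan for divergent values, or a
    // count- or parity-based offset for uniform values.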
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
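      // In a pixel shader the exit block needs its own PHI, because helper
      // lanes bypassed the optimized atomic entirely.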
      PHINode *const PHI = B.CreatePHI(Ty, 2);
      I.replaceAllUsesWith(PHI);
    } else {
      I.replaceAllUsesWith(Result);
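// Register the legacy pass and the analyses it depends on.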
INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)
FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}