#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

struct ReplacementInfo {
  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  const bool Changed = !ToReplace.empty();
  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }
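// Visitor for atomicrmw instructions: record candidates whose pointer operand
// is wave-uniform so that runOnFunction can rewrite them via optimizeAtomic().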
void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  switch (I.getPointerAddressSpace()) {
  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;
  // If the pointer operand is divergent, each lane is updating a different
  // address and there is nothing to combine.
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));
  // Divergent values can only be combined when the subtarget has DPP and the
  // atomic operates on 32-bit values.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
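// Visitor for AMDGPU buffer atomic intrinsics: map each supported intrinsic to
// the equivalent AtomicRMWInst binary operation and queue it for rewriting.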
void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }
  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));
  // As above, divergent values are only handled when DPP is available and the
  // type is 32 bits wide.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }
  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
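// buildNonAtomicBinOp emits the plain (non-atomic) IR equivalent of the atomic
// operation; it is used to combine the values contributed by individual lanes.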
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);
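// buildReduction combines the values of every active lane in the wavefront
// into a single value, using DPP row operations plus permlane/readlane to
// cross row and half-wave boundaries.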
Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *const Identity) const {
  Module *M = B.GetInsertBlock()->getModule();
  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(Intrinsic::amdgcn_permlanex16, {},
                        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));
  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    return buildNonAtomicBinOp(
        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
  }
  // Without permlane64, read a lane from each half of the wave and combine
  // them with a scalar operation.
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
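// buildScan builds a wavefront-wide inclusive prefix scan of V: first within
// each row of 16 lanes via DPP shifts, then across rows via DPP row broadcasts
// when the subtarget has them, or permlane/readlane otherwise.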
  Module *M = B.GetInsertBlock()->getModule();

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // The subtarget has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // Without row broadcasts, DPP operations are confined to a single row, so
    // use permlanex16 to carry lane 15 of each row into the following row.
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
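// buildShiftRight shifts the inclusive scan right by one lane (filling lane 0
// with the identity) to turn it into an exclusive scan.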
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Module *M = B.GetInsertBlock()->getModule();
  if (ST->hasDPPWavefrontShifts()) {
    // The subtarget has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});
    // Copy the old lanes 31 and 47 to the new lanes 32 and 48.
    if (!ST->isWave32()) {
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
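// optimizeAtomic rewrites the atomic so that a single lane performs one
// combined atomic operation on behalf of the whole wavefront. In pixel shaders
// the sequence is first wrapped in an amdgcn.ps.live check so that helper
// invocations do not contribute.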
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});

    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);

  Value *const V = I.getOperand(ValIdx);
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
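  // Count how many active lanes precede this one (mbcnt of the ballot mask).
  // On wave64 the 64-bit ballot is split into two halves for mbcnt_lo/hi.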
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);
  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();
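    // For divergent values the combine runs in whole-wave mode: set_inactive
    // fills inactive lanes with the identity so the DPP scan/reduction can
    // safely read every lane.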
    NewV =
        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (!NeedResult && ST->hasPermLaneX16()) {
      // No lane needs the old value, so a full reduction is sufficient.
      NewV = buildReduction(B, ScanOp, NewV, Identity);
    } else {
      NewV = buildScan(B, ScanOp, NewV, Identity);
      if (NeedResult)
        ExclScan = buildShiftRight(B, NewV, Identity);
      // The last lane of the scan now holds the combined value of every active
      // lane; read it back as the wave-wide atomic operand.
      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    }
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
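    // When the value is wave-uniform, the combined contribution can instead be
    // computed from the active-lane count (ctpop of the ballot): for add/sub
    // it is V times the number of active lanes, for xor V times the parity of
    // that count.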
    Value *const Ctpop = B.CreateIntCast(
        B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
    Value *const Ctpop = B.CreateIntCast(
        B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
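  // Only the first active lane (Mbcnt == 0) performs the real atomic operation
  // on behalf of the whole wavefront.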
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
  B.SetInsertPoint(SingleLaneTerminator);
  B.SetInsertPoint(&I);
  PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
  Value *BroadcastI = nullptr;
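  // Broadcast the result of the single-lane atomic to every lane. readfirstlane
  // only handles 32-bit values, so a 64-bit result is split into two halves and
  // reassembled.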
  if (TyBitWidth == 64) {
    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
    Value *const ExtractHi =
        B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
    CallInst *const ReadFirstLaneLo =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
    CallInst *const ReadFirstLaneHi =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
    Value *const PartialInsert = B.CreateInsertElement(
        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
    Value *const Insert =
        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
    BroadcastI = B.CreateBitCast(Insert, Ty);
  } else if (TyBitWidth == 32) {
    BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
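  // Each lane's final result is the value returned to the single atomic lane,
  // combined (via the original operation) with its own lane offset: the
  // exclusive scan for divergent values, or an Mbcnt-derived term for uniform
  // values.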
  Value *LaneOffset = nullptr;

  if (ValDivergent) {
    LaneOffset =
        B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      LaneOffset = B.CreateSelect(Cond, Identity, V);
      break;
    case AtomicRMWInst::Xor:
      LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
      break;
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      I.replaceAllUsesWith(Result);
    }
                      "AMDGPU atomic optimizations", false, false)
  return new AMDGPUAtomicOptimizer();