//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
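///
/// For a wavefront-uniform value, one lane issues a single atomic scaled by
/// the number of active lanes, and every lane then derives its own result from
/// the value returned by that atomic and its position among the active lanes.
/// For divergent 32-bit values, the per-lane contributions are first combined
/// with DPP cross-lane operations before the single atomic is issued.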
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

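// Describes one atomic instruction that the visitor has found profitable to
// rewrite: the instruction itself, which atomicrmw operation it performs,
// which operand holds the value, and whether that value is divergent.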
struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

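// Visitor callback for atomicrmw instructions: record those on global or local
// memory that use a uniform pointer and a supported integer operation.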
void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

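// Visitor callback for intrinsic calls: the buffer atomic intrinsics are
// handled like atomicrmw, with the value operand at index 0 and all other
// operands required to be uniform.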
void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));

  if (ST->isWave32())
    return V;

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                        Value *V, Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

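// Returns the identity value for the given atomic operation at the given bit
// width: the value that leaves the other operand unchanged (0 for add, sub,
// or, xor and umax; all ones for and and umin; the signed extremes for signed
// max and min).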
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

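// Multiply LHS by RHS, folding the multiply away when LHS is the constant 1.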
static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

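  // The identity value is used both to initialize inactive lanes for the scan
  // below and as the lane offset for the uniform idempotent operations.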
  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

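    // A subtraction is accumulated as an addition: the scanned prefix sums are
    // later subtracted from the value the atomic returns.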
    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (!NeedResult && ST->hasPermLaneX16()) {
      // On GFX10 the permlanex16 instruction helps us build a reduction
      // without too many readlanes and writelanes, which are generally bad for
      // performance.
      NewV = buildReduction(B, ScanOp, NewV, Identity);
    } else {
      NewV = buildScan(B, ScanOp, NewV, Identity);
      if (NeedResult)
        ExclScan = buildShiftRight(B, NewV, Identity);

      // Read the value from the last lane, which has accumulated the values of
      // each active lane in the wavefront. This will be our new value which we
      // will provide to the atomic operation.
      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
      assert(TyBitWidth == 32);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset =
          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}