//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

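// Information about a candidate atomic operation that the pass will rewrite:
// the instruction itself, the binary operation it performs, the index of its
// value operand, and whether that value is divergent across the wavefront.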
struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;

  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

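// The visitors below only collect candidates into ToReplace; the actual
// rewriting is done afterwards by optimizeAtomic, driven from runOnFunction.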
void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

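  // Map the buffer atomic intrinsics onto the corresponding atomicrmw
  // operation so the rest of the pass can treat both forms uniformly.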
  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

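  // The min/max variants have no single IR binary operator, so lower them to
  // an integer compare followed by a select of the two operands.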
  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                        Value *V, Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
  Function *PermLaneX16 =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});

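  // Each iteration combines a lane with the lane 2^Idx below it within the
  // same row of 16 lanes (row_shr:1, 2, 4, 8), building an inclusive prefix
  // scan of every row in four DPP steps.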
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    Value *const PermX =
        B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
                                   B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Function *WriteLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

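// Return the identity value of the given binary atomicrmw operation: the
// value that leaves the other operand unchanged when combined with it.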
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

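// Rewrite a candidate atomic so only one lane issues it: combine the per-lane
// values across the wavefront, branch so a single lane performs the atomic,
// then broadcast the result and add back each lane's own contribution.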
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
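    // In wave64 the ballot is a 64-bit mask, but mbcnt_lo/mbcnt_hi each take
    // a 32-bit mask, so split the ballot into its low and high halves.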
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

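    // A subtraction is scanned as an addition: the per-lane values are summed
    // and the single atomic sub then subtracts that total once.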
    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    NewV = buildScan(B, ScanOp, NewV, Identity);
    ExclScan = buildShiftRight(B, NewV, Identity);

    // Read the value from the last lane, which has accumulated the values of
    // each active lane in the wavefront. This will be our new value which we
    // will provide to the atomic operation.
    Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
    if (TyBitWidth == 64) {
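      // readlane only operates on 32-bit values, so read the low and high
      // halves of a 64-bit value separately and reassemble them afterwards.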
      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
      CallInst *const ReadLaneLo = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
      CallInst *const ReadLaneHi = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
      NewV = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = B.CreateMul(V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  const bool NeedResult = !I.use_empty();
  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = B.CreateMul(V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}