LLVM  10.0.0svn
AMDGPUAtomicOptimizer.cpp
//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
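///
/// For example (an illustrative sketch, not a verbatim test case; the names
/// %ptr, %v and %old are made up here): a 32-bit "atomicrmw add" of a
/// wavefront-uniform value %v through a uniform pointer %ptr is rewritten so
/// that only one lane performs
///
///   %old = atomicrmw add i32* %ptr, i32 (%v * popcount(exec))
///
/// and every active lane then reconstructs the value it would have observed
/// as
///
///   readfirstlane(%old) + %v * mbcnt(exec)
///
/// where mbcnt(exec) counts the active lanes below the current one. Divergent
/// values are first combined with a DPP-based exclusive scan; see
/// optimizeAtomic below.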
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;

namespace {

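// DPP (data-parallel primitives) control values used for the cross-lane scan
// below. These appear to follow the dpp_ctrl encoding of GFX8/GFX9: 0x110+N
// is row_shr:N (shift a row of 16 lanes right by N), 0x138 is wave_shr:1, and
// 0x142/0x143 broadcast lane 15 of a row / lane 31 of a row pair to the lanes
// above them.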
enum DPP_CTRL {
  DPP_ROW_SR1 = 0x111,
  DPP_ROW_SR2 = 0x112,
  DPP_ROW_SR3 = 0x113,
  DPP_ROW_SR4 = 0x114,
  DPP_ROW_SR8 = 0x118,
  DPP_WF_SR1 = 0x138,
  DPP_ROW_BCAST15 = 0x142,
  DPP_ROW_BCAST31 = 0x143
};

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  bool HasDPP;
  bool IsPixelShader;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  HasDPP = ST.hasDPP();
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

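// Return the identity element for the given atomic operation at the requested
// bit width: combining any value V with it (V op identity) leaves V unchanged,
// e.g. 0 for add/sub/or/xor, all-ones for and, INT_MIN for signed max.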
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  CallInst *const Ballot = B.CreateIntrinsic(
      Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()},
      {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});
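  // Note: icmp ne (i32 1, i32 0) holds in every lane, so this intrinsic call
  // simply returns the current exec mask as an i64, with one bit set for each
  // active lane.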

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
  Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
  Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
  CallInst *const PartialMbcnt = B.CreateIntrinsic(
      Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
  Value *const Mbcnt =
      B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
                                        {ExtractHi, PartialMbcnt}),
                      Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    CallInst *const SetInactive =
        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

    ExclScan =
        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
                          {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    const unsigned Iters = 6;
    const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2,
                                     DPP_ROW_SR4, DPP_ROW_SR8,
                                     DPP_ROW_BCAST15, DPP_ROW_BCAST31};
    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
    const unsigned BankMask[Iters] = {0xf, 0xf, 0xe, 0xc, 0xf, 0xf};
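    // Together with the wave_shr:1 step above, these six DPP steps implement a
    // wavefront-wide exclusive scan: the row_shr:1/2/4/8 steps combine values
    // within each row of 16 lanes, and the row_bcast:15/row_bcast:31 steps
    // (with the row and bank masks limiting which lanes are updated) fold each
    // row's running total into the rows above it.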

    // This loop performs an exclusive scan across the wavefront, with all lanes
    // active (by using the WWM intrinsic).
    for (unsigned Idx = 0; Idx < Iters; Idx++) {
      CallInst *const DPP = B.CreateIntrinsic(
          Intrinsic::amdgcn_update_dpp, Ty,
          {Identity, ExclScan, B.getInt32(DPPCtrl[Idx]),
           B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});

      ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
    }

    NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);

    // Read the value from the last lane, which has accumulated the values of
    // each active lane in the wavefront. This will be our new value which we
    // will provide to the atomic operation.
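    // Note that this code assumes a 64-lane wavefront: the ballot is an i64
    // and the reduction is read back from lane 63.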
    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
      CallInst *const ReadLaneLo = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
      CallInst *const ReadLaneHi = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
      NewV = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, B.getInt32(63)});
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = B.CreateMul(V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  const bool NeedResult = !I.use_empty();
  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = B.CreateMul(V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}