LLVM 9.0.0svn
AMDGPUAtomicOptimizer.cpp
//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;

namespace {

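// DPP control values used below; these mirror the hardware dpp_ctrl encodings
// (row_shr:N is 0x110 + N, wave_shr:1 is 0x138, and row_bcast15/row_bcast31
// are 0x142/0x143, which broadcast lane 15 / lane 31 into the following rows).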
enum DPP_CTRL {
  DPP_ROW_SR1 = 0x111,
  DPP_ROW_SR2 = 0x112,
  DPP_ROW_SR3 = 0x113,
  DPP_ROW_SR4 = 0x114,
  DPP_ROW_SR8 = 0x118,
  DPP_WF_SR1 = 0x138,
  DPP_ROW_BCAST15 = 0x142,
  DPP_ROW_BCAST31 = 0x143
};

struct ReplacementInfo {
  Instruction *I;
  Instruction::BinaryOps Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  bool HasDPP;
  bool IsPixelShader;

  void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
                      unsigned ValIdx, bool ValDivergent) const;

  void setConvergent(CallInst *const CI) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  HasDPP = ST.hasDPP();
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

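  // Candidate atomics were only recorded during visitation; they are rewritten
  // here afterwards because optimizeAtomic splits blocks and erases the
  // original instruction, which would invalidate the traversal.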
  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  Instruction::BinaryOps Op;

  switch (I.getOperation()) {
  default:
    return;
  case AtomicRMWInst::Add:
    Op = Instruction::Add;
    break;
  case AtomicRMWInst::Sub:
    Op = Instruction::Sub;
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergent(I.getOperand(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  Instruction::BinaryOps Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = Instruction::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = Instruction::Sub;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergent(I.getOperand(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           Instruction::BinaryOps Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
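  // (The llvm.amdgcn.ps.live intrinsic used below reports whether the current
  // lane is a real pixel invocation rather than a helper lane that exists only
  // for derivative computation.)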
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
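  // The ballot is built with llvm.amdgcn.icmp(i32 1, i32 0, i32 33): predicate
  // 33 is ICMP_NE, so every executing lane evaluates 1 != 0 and the returned
  // 64-bit mask has a bit set for exactly the currently active lanes.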
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_icmp, {B.getInt32Ty()},
                        {B.getInt32(1), B.getInt32(0), B.getInt32(33)});
  setConvergent(Ballot);

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
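  // mbcnt.lo counts the set bits of the low half of the ballot that belong to
  // lanes below ours, and mbcnt.hi adds those from the high half, so the pair
  // covers the whole 64-bit mask.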
  Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
  Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
  Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
  CallInst *const PartialMbcnt = B.CreateIntrinsic(
      Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
  CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
                                            {ExtractHi, PartialMbcnt});

  Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);

  Value *LaneOffset = nullptr;
  Value *NewV = nullptr;

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    Value *const Identity = B.getIntN(TyBitWidth, 0);

    // First we need to set all inactive invocations to 0, so that they can
    // correctly contribute to the final result.
    CallInst *const SetInactive =
        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
    setConvergent(SetInactive);

    CallInst *const FirstDPP =
        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
                          {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
    setConvergent(FirstDPP);
    NewV = FirstDPP;

    const unsigned Iters = 7;
    const unsigned DPPCtrl[Iters] = {
        DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4,
        DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};
    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
    const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf};

    // This loop performs an exclusive scan across the wavefront, with all lanes
    // active (by using the WWM intrinsic).
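    // The row_shr:1/2/3/4/8 steps accumulate partial sums within each row of
    // 16 lanes, and the row_bcast15/row_bcast31 steps (restricted by their row
    // masks) then carry each row's total into the rows above it.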
    for (unsigned Idx = 0; Idx < Iters; Idx++) {
      Value *const UpdateValue = Idx < 3 ? FirstDPP : NewV;
      CallInst *const DPP = B.CreateIntrinsic(
          Intrinsic::amdgcn_update_dpp, Ty,
          {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
           B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});
      setConvergent(DPP);

      NewV = B.CreateBinOp(Op, NewV, DPP);
    }

    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
    NewV = B.CreateBinOp(Op, NewV, SetInactive);

    // Read the value from the last lane, which has accumulated the values of
    // each active lane in the wavefront. This will be the new value we provide
    // to the atomic operation.
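    // Note that lane 63 is hardcoded below: the pass assumes a 64-wide
    // wavefront.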
    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
      CallInst *const ReadLaneLo = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
      setConvergent(ReadLaneLo);
      CallInst *const ReadLaneHi = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
      setConvergent(ReadLaneHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
      NewV = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
                                                   {}, {NewV, B.getInt32(63)});
      setConvergent(ReadLane);
      NewV = ReadLane;
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
  } else {
    // Get the total number of active lanes we have by using popcount.
    Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
    Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);

    // Calculate the new value we will be contributing to the atomic operation
    // for the entire wavefront.
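    // For an add with a uniform V and five active lanes, for example, a single
    // atomic add of 5 * V is issued, and a lane with two active lanes below it
    // later recovers its own result as oldValue + 2 * V.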
    NewV = B.CreateMul(V, CtpopCast);
    LaneOffset = B.CreateMul(V, MbcntCast);
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  // Create a PHI node to get our new atomic result into the exit block.
  PHINode *const PHI = B.CreatePHI(Ty, 2);
  PHI->addIncoming(UndefValue::get(Ty), EntryBB);
  PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

  // We need to broadcast the value from the lowest active lane (the first
  // lane) to all other lanes in the wavefront. We use an intrinsic for this,
  // but have to handle 64-bit broadcasts with two calls to this intrinsic.
  Value *BroadcastI = nullptr;

  if (TyBitWidth == 64) {
    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
    Value *const ExtractHi =
        B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
    CallInst *const ReadFirstLaneLo =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
    setConvergent(ReadFirstLaneLo);
    CallInst *const ReadFirstLaneHi =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
    setConvergent(ReadFirstLaneHi);
    Value *const PartialInsert = B.CreateInsertElement(
        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
    Value *const Insert =
        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
    BroadcastI = B.CreateBitCast(Insert, Ty);
  } else if (TyBitWidth == 32) {
    CallInst *const ReadFirstLane =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    setConvergent(ReadFirstLane);
    BroadcastI = ReadFirstLane;
  } else {
    llvm_unreachable("Unhandled atomic bit width");
  }

  // Now that we have the result of our single atomic operation, we need to
  // get our individual lane's slice into the result. We use the lane offset we
  // previously calculated combined with the atomic result value we got from the
  // first lane, to get our lane's index into the atomic result.
  Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);

  if (IsPixelShader) {
    // Need a final PHI to reconverge to above the helper lane branch mask.
    B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
    PHI->addIncoming(Result, I.getParent());
    I.replaceAllUsesWith(PHI);
  } else {
    // Replace the original atomic instruction with the new one.
    I.replaceAllUsesWith(Result);
  }

  // And delete the original.
  I.eraseFromParent();
}

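// The cross-lane intrinsic calls created above are marked convergent so that
// later passes do not move or duplicate them in ways that would change the
// set of lanes executing them.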
void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
  CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}
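
For reference, here is a standalone sketch (not part of LLVM; all names are illustrative, and it relies on the GCC/Clang __builtin_popcountll builtin) that simulates on the CPU the arithmetic the rewritten IR performs for a uniform add: one combined atomic per wavefront, with each lane reconstructing the value its own atomic would have returned from the broadcast result and the count of active lanes below it.

#include <atomic>
#include <cstdint>
#include <iostream>

// Simulate one 64-lane wavefront performing "atomicrmw add" with a uniform
// value V. "Exec" is the ballot of active lanes; only one combined atomic is
// issued for the whole wavefront.
static void simulateWavefrontAdd(std::atomic<uint32_t> &Mem, uint64_t Exec,
                                 uint32_t V, uint32_t LaneResults[64]) {
  const uint32_t ActiveLanes = __builtin_popcountll(Exec);

  // The single elected lane performs the combined atomic and obtains the old
  // value of the memory location (broadcast via readfirstlane in the real IR).
  const uint32_t OldValue = Mem.fetch_add(V * ActiveLanes);

  for (unsigned Lane = 0; Lane < 64; ++Lane) {
    if (!(Exec & (1ull << Lane)))
      continue;
    // mbcnt equivalent: number of active lanes strictly below this lane.
    const uint32_t LanesBelow =
        __builtin_popcountll(Exec & ((1ull << Lane) - 1));
    // Each lane recovers what its own atomic add would have returned.
    LaneResults[Lane] = OldValue + V * LanesBelow;
  }
}

int main() {
  std::atomic<uint32_t> Mem{100};
  uint32_t Results[64] = {};
  simulateWavefrontAdd(Mem, /*Exec=*/0b101101ull, /*V=*/7, Results);
  std::cout << "final memory value: " << Mem.load() << "\n"; // 100 + 4 * 7
  std::cout << "lane 5 result:      " << Results[5] << "\n"; // 100 + 3 * 7
  return 0;
}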