//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass replaces accesses to kernel arguments with loads from
/// offsets from the kernarg base pointer.
//
//===----------------------------------------------------------------------===//
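//
// Illustrative sketch (hand-written, not actual compiler output): assuming an
// HSA kernel @k whose only argument is `i32 %x` at kernarg offset 0, uses of
// %x are rewritten to something like
//
//   %k.kernarg.segment = call i8 addrspace(4)*
//       @llvm.amdgcn.kernarg.segment.ptr()
//   %x.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)*
//       %k.kernarg.segment, i64 0
//   %x.kernarg.offset.cast = bitcast i8 addrspace(4)* %x.kernarg.offset to
//       i32 addrspace(4)*
//   %x.load = load i32, i32 addrspace(4)* %x.kernarg.offset.cast, align 16
//
// with all uses of %x replaced by %x.load.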

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"

using namespace llvm;

namespace {

class AMDGPULowerKernelArguments : public FunctionPass {
public:
  static char ID;

  AMDGPULowerKernelArguments() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

// Skip allocas: place the insertion point after the static allocas at the
// start of the entry block.
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);

    // If this is a dynamic alloca, the value may depend on the loaded kernargs,
    // so loads will need to be inserted before it.
    if (!AI || !AI->isStaticAlloca())
      break;
  }

  return InsPt;
}

bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  auto &TPC = getAnalysis<TargetPassConfig>();

  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getParent()->getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&*getInsertPt(EntryBlock));

  const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);

  Align MaxAlign;
  // FIXME: Alignment is broken with explicit arg offset.
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;

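  // Materialize the kernarg segment base pointer once at the insertion point;
  // every lowered argument access below is a constant offset from this call.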
  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
                              nullptr, F.getName() + ".kernarg.segment");

  KernArgSegment->addRetAttr(Attribute::NonNull);
  KernArgSegment->addRetAttr(
      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

  unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
  uint64_t ExplicitArgOffset = 0;

  for (Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : None;
    Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t Size = DL.getTypeSizeInBits(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

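    // Compute this argument's offset within the kernarg segment, then advance
    // the running offset past its alloc size for the next argument.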
    uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

    if (Arg.use_empty())
      continue;

    // If this is byref, the loads are already explicit in the function. We just
    // need to rewrite the pointer values.
    if (IsByRef) {
      Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".byval.kernarg.offset");

      Value *CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
          ArgOffsetPtr, Arg.getType());
      Arg.replaceAllUsesWith(CastOffsetPtr);
      continue;
    }

    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
      // can't represent this with range metadata because it's only allowed for
      // integer types.
      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
          !ST.hasUsableDSOffset())
        continue;

      // FIXME: We can replace this with equivalent alias.scope/noalias
      // metadata, but this appears to be a lot of work.
      if (Arg.hasNoAliasAttr())
        continue;
    }

    auto *VT = dyn_cast<FixedVectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

    VectorType *V4Ty = nullptr;

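    // For the sub-dword (DoShiftOpt) case, load the whole dword containing the
    // value: AlignDownOffset is the offset rounded down to a 4-byte boundary,
    // and OffsetDiff is the byte offset of the value within that dword.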
    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    Align AdjustedAlign = commonAlignment(
        KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);

    Value *ArgPtr;
    Type *AdjustedArgTy;
    if (DoShiftOpt) { // FIXME: Handle aggregate types
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
      // loading earlier than the argument address, and extracting the relevant
      // bits.
      //
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
      // so that CSE between different argument loads works easily.
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
          Arg.getName() + ".kernarg.offset.align.down");
      AdjustedArgTy = Builder.getInt32Ty();
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".kernarg.offset");
      AdjustedArgTy = ArgTy;
    }

    if (IsV3 && Size >= 32) {
      V4Ty = FixedVectorType::get(VT->getElementType(), 4);
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
      AdjustedArgTy = V4Ty;
    }

    ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
                                   ArgPtr->getName() + ".cast");
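
    // The kernarg segment is constant for the duration of the kernel, so the
    // load can be marked invariant and freely CSE'd or hoisted.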
    LoadInst *Load =
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));

    MDBuilder MDB(Ctx);

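    // Mirror pointer-argument attributes (nonnull, dereferenceable,
    // dereferenceable_or_null, align) onto the load as metadata, since the
    // Argument itself is going away.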
    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable,
            MDNode::get(Ctx,
                        MDB.createConstant(
                            ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable_or_null,
            MDNode::get(Ctx,
                        MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
                                                            DerefOrNullBytes))));
      }

      unsigned ParamAlign = Arg.getParamAlignment();
      if (ParamAlign != 0) {
        Load->setMetadata(
            LLVMContext::MD_align,
            MDNode::get(Ctx,
                        MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
                                                            ParamAlign))));
      }
    }

    // TODO: Convert noalias arg to !noalias

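    // Finally, replace all uses of the Argument: shift and truncate for the
    // widened sub-dword case, extract the first three lanes for v3 vectors,
    // or use the load directly otherwise.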
    if (DoShiftOpt) {
      Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
                                            Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }

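  // Now that MaxAlign holds the largest argument alignment seen, tighten the
  // align attribute on the kernarg segment pointer.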
  KernArgSegment->addRetAttr(
      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
                      "AMDGPU Lower Kernel Arguments", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE,
                    "AMDGPU Lower Kernel Arguments", false, false)

char AMDGPULowerKernelArguments::ID = 0;

FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
  return new AMDGPULowerKernelArguments();
}
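
// Usage sketch (hypothetical driver code, not part of this file): with the
// legacy pass manager, assuming an already-constructed LLVMTargetMachine *TM
// and a Module M, the pass could be scheduled roughly like this; the
// createPassConfig call is what satisfies this pass's TargetPassConfig
// requirement.
//
//   legacy::PassManager PM;
//   PM.add(TM->createPassConfig(PM));   // provides TargetPassConfig
//   PM.add(createAMDGPULowerKernelArgumentsPass());
//   PM.run(M);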