LLVM 20.0.0git
AMDGPUImageIntrinsicOptimizer.cpp
Go to the documentation of this file.
1//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
10// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
11//
12// - they refer to the same vaddr except for sample_id,
13// - they use a constant sample_id and they fall into the same group,
14// - they have the same dmask and the number of intrinsics and the number of
15// vaddr/vdata dword transfers is reduced by the combine.
16//
17// Examples for the tradeoff (all are assuming 2DMsaa for vaddr):
18//
19// +----------+-----+-----+-------+---------+------------+---------+----------+
20// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
21// | (dmask) | | | | vdata | | vdata | |
22// +----------+-----+-----+-------+---------+------------+---------+----------+
23// | 1 | 0 | 0 | 4 | 12 / 4 | 1 | 3 / 4 | yes |
24// +----------+-----+-----+-------+---------+------------+---------+----------+
25// | 1 | 0 | 0 | 2 | 6 / 2 | 1 | 3 / 4 | yes? |
26// +----------+-----+-----+-------+---------+------------+---------+----------+
27// | 2 | 0 | 0 | 4 | 12 / 8 | 2 | 6 / 8 | yes |
28// +----------+-----+-----+-------+---------+------------+---------+----------+
29// | 2 | 0 | 0 | 2 | 6 / 4 | 2 | 6 / 8 | no |
30// +----------+-----+-----+-------+---------+------------+---------+----------+
31// | 1 | 0 | 1 | 2 | 6 / 2 | 1 | 3 / 2 | yes |
32// +----------+-----+-----+-------+---------+------------+---------+----------+
33//
34// Some cases are of questionable benefit, like the one marked with "yes?"
35// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
36// and TX, but higher vdata. We start by erring on the side of converting these
37// to MSAA_LOAD.
38//
39// clang-format off
40//
41// This pass will combine intrinsics such as (not necessarily consecutive):
42// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
43// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
44// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
45// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
46// ==>
47// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
48//
49// clang-format on
50//
51// Future improvements:
52//
53// - We may occasionally not want to do the combine if it increases the maximum
54// register pressure.
55//
56// - Ensure clausing when multiple MSAA_LOAD are generated.
57//
58// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
59// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
60// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
61// we don't know the format at compile time.
62//===----------------------------------------------------------------------===//
63
64#include "AMDGPU.h"
65#include "AMDGPUInstrInfo.h"
66#include "AMDGPUTargetMachine.h"
67#include "llvm/IR/Function.h"
68#include "llvm/IR/IRBuilder.h"
70#include "llvm/IR/IntrinsicsAMDGPU.h"
71#include "llvm/Pass.h"
73
74using namespace llvm;
75
76#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"
77
78namespace {
// FunctionPass subclass for the image intrinsic optimization described in the
// file header. It only stores the TargetMachine pointer it is constructed
// with and declares the runOnFunction override.
79class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
 // Target machine given to the constructor; nullptr when none is supplied.
80 const TargetMachine *TM;
81
82public:
 // Static member handed to the FunctionPass base-class constructor below.
83 static char ID;
84
 // TM is optional and defaults to nullptr.
85 AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
86 : FunctionPass(ID), TM(TM) {}
87
 // Per-function entry point; defined out of line further down in this file.
88 bool runOnFunction(Function &F) override;
89
90}; // End of class AMDGPUImageIntrinsicOptimizer
91} // End anonymous namespace
92
// Invokes INITIALIZE_PASS for the pass class with DEBUG_TYPE as the argument
// string and "AMDGPU Image Intrinsic Optimizer" as the name; the two trailing
// `false` values fill the macro's cfg and analysis parameters.
93INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
94 "AMDGPU Image Intrinsic Optimizer", false, false)
95
// Out-of-class definition of the static ID member declared in the class.
96char AMDGPUImageIntrinsicOptimizer::ID = 0;
97
100 SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
101 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
 // Appends II to the first list in MergeableInsts whose front instruction is
 // compatible with it, or starts a new one-element list when none is.
 // Compatibility is judged against the list's front() only and requires:
 //  - the same intrinsic ID,
 //  - the same result type,
 //  - identical argument values for every index visited by the loop below,
 //    except the argument at index VAddrEnd - 1, where the two constants only
 //    need to agree after unsigned division by 4.
 //
 // NOTE(review): the opening of this function's signature is not present in
 // this listing; the symbol index at the end of the file gives it as
 // `void addInstToMergeableList(IntrinsicInst *II, ...)`.
102 for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
103 // Check Dim.
104 if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
105 continue;
106
107 // Check D16.
108 if (IIList.front()->getType() != II->getType())
109 continue;
110
111 // Check all arguments (DMask, VAddr, RSrc etc).
 // NOTE(review): the loop starts at index 1, so argument 0 is never compared
 // here. The example calls in the file header pass the dmask as the first
 // argument, and the header lists "same dmask" as a precondition -- confirm
 // whether skipping index 0 is intentional (e.g. relied on via the type
 // check above) or an oversight.
112 bool AllEqual = true;
113 assert(IIList.front()->arg_size() == II->arg_size());
114 for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
115 Value *ArgList = IIList.front()->getArgOperand(I);
116 Value *Arg = II->getArgOperand(I);
117 if (I == ImageDimIntr->VAddrEnd - 1) {
118 // Check FragId group.
 // The cast<> calls assume both operands are ConstantInt; the visible caller
 // checks that for II before calling this function.
119 auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
120 auto FragId = cast<ConstantInt>(II->getArgOperand(I));
121 AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
122 } else {
123 // Check all arguments except FragId.
 // Pointer comparison: the two calls must use the very same Value.
124 AllEqual = ArgList == Arg;
125 }
126 }
127 if (!AllEqual)
128 continue;
129
130 // Add to the list.
131 IIList.emplace_back(II);
132 return;
133 }
134
135 // Similar instruction not found, so add a new list.
136 MergeableInsts.emplace_back(1, II);
137 LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
138}
139
140// Collect list of all instructions we know how to merge in a subset of the
141// block. It returns an iterator to the instruction after the last one analyzed.
// NOTE(review): the signature lines of this function are not present in this
// listing; the symbol index at the end of the file gives it as
// `BasicBlock::iterator collectMergeableInsts(BasicBlock::iterator I,
//  BasicBlock::iterator E, SmallVector<SmallVector<IntrinsicInst *, 4>>
//  &MergeableInsts)`.
//
// Walks [I, E) and hands every candidate image load to
// addInstToMergeableList. The walk ends at E or just past the first
// instruction that reports side effects, and the resulting iterator is
// returned so the caller can resume from there.
145 for (; I != E; ++I) {
146 // Don't combine if there is a store in the middle or if there is a memory
147 // barrier.
148 if (I->mayHaveSideEffects()) {
 // Step over the side-effecting instruction so it is not revisited.
149 ++I;
150 break;
151 }
152
153 // Ignore non-intrinsics.
154 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
155 Intrinsic::ID IntrinID = II->getIntrinsicID();
156
157 // Ignore other intrinsics.
158 if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
159 IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
160 continue;
161
162 // Check for constant FragId.
 // The fragment id is taken to be the operand at index VAddrEnd - 1.
163 const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
164 const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
165 if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
166 continue;
167
168 LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
169 addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
170 }
171 }
172
173 return I;
174}
175
// NOTE(review): the signature line of this function is not present in this
// listing; the symbol index at the end of the file gives it as
// `bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>>
//  MergeableInsts)`.
//
// For every list with more than one entry that passes the transfer-count
// check below, emits one image_msaa_load call per set bit of the (4-bit)
// dmask in front of the list's first instruction, rebuilds each original
// load's value from those calls with extractelement/insertelement, and
// replaces the original's uses. The originals are erased only after all lists
// have been processed. Returns true if at least one list was rewritten.
177 bool Modified = false;
178
179 SmallVector<Instruction *, 4> InstrsToErase;
180 for (const auto &IIList : MergeableInsts) {
 // A single load has nothing to be merged with.
181 if (IIList.size() <= 1)
182 continue;
183
184 // Assume the arguments are unchanged and later override them, if needed.
185 SmallVector<Value *, 16> Args(IIList.front()->args());
186
187 // Validate function argument and return types, extracting overloaded
188 // types along the way.
189 SmallVector<Type *, 6> OverloadTys;
190 Function *F = IIList.front()->getCalledFunction();
191 if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
192 continue;
193
194 Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
 // NOTE(review): listing line 196, which holds the initializer of
 // ImageDimIntr, is not present in this listing.
195 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
197
 // The replacement intrinsic is given a 4-element vector of the original
 // scalar type as its first overloaded type.
198 Type *EltTy = IIList.front()->getType()->getScalarType();
199 Type *NewTy = FixedVectorType::get(EltTy, 4);
200 OverloadTys[0] = NewTy;
201 bool isD16 = EltTy->isHalfTy();
202
 // Only the low four dmask bits are considered; NumElts is how many are set.
203 ConstantInt *DMask = cast<ConstantInt>(
204 IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
205 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
206 unsigned NumElts = popcount(DMaskVal);
207
208 // Number of instructions and the number of vaddr/vdata dword transfers
209 // should be reduced.
 // One msaa_load is needed per enabled channel. The factor 3 matches the
 // vaddr count used in the file header's 2DMsaa examples.
 // NOTE(review): presumably 2darraymsaa or a16 addressing would use a
 // different vaddr count -- confirm whether the constant is meant to cover
 // those cases.
 // With d16 (half elements) two elements are counted per vdata dword.
210 unsigned NumLoads = IIList.size();
211 unsigned NumMsaas = NumElts;
212 unsigned NumVAddrLoads = 3 * NumLoads;
213 unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
214 unsigned NumVAddrMsaas = 3 * NumMsaas;
215 unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;
216
 // Skip the list when the rewrite would need more instructions than it
 // removes, or when the original loads already move fewer vaddr + vdata
 // dwords in total than the msaa_loads would.
217 if (NumLoads < NumMsaas ||
218 (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
219 continue;
220
 // The merged call uses the front load's fragment id rounded down to a
 // multiple of 4.
221 const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
222 auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
223 const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;
224
225 // Create the new instructions.
 // All new instructions are inserted before the first load of the list.
226 IRBuilder<> B(IIList.front());
227
228 // Create the new image_msaa_load intrinsic.
 // NOTE(review): listing line 229 is not present in this listing; NewCalls,
 // used below, is presumably declared there.
230 while (DMaskVal != 0) {
 // Isolate the lowest dmask bit that is still set.
231 unsigned NewMaskVal = 1 << countr_zero(DMaskVal);
232
233 Intrinsic::ID NewIntrinID;
234 if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
235 NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
236 else
237 NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;
238
 // NOTE(review): listing line 239 is not present in this listing; the line
 // below is the tail of the statement that defines NewIntrin.
240 IIList.front()->getModule(), NewIntrinID, OverloadTys);
 // Override only the dmask (single bit) and the fragment id; every other
 // argument is reused from the front load.
241 Args[ImageDimIntr->DMaskIndex] =
242 ConstantInt::get(DMask->getType(), NewMaskVal);
243 Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
244 CallInst *NewCall = B.CreateCall(NewIntrin, Args);
245 LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");
246
247 NewCalls.push_back(NewCall);
 // Clear the bit just handled so the loop terminates.
248 DMaskVal -= NewMaskVal;
249 }
250
251 // Create the new extractelement instructions.
252 for (auto &II : IIList) {
253 Value *VecOp = nullptr;
 // Fragment id modulo 4 is the element index within each msaa_load result.
254 auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
 // New instructions take the debug location of the load they replace.
255 B.SetCurrentDebugLocation(II->getDebugLoc());
256 if (NumElts == 1) {
257 VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
258 LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
259 } else {
 // Multi-channel result: start from undef of the original type and fill
 // element I from the I-th msaa_load call.
260 VecOp = UndefValue::get(II->getType());
261 for (unsigned I = 0; I < NumElts; ++I) {
262 VecOp = B.CreateInsertElement(
263 VecOp,
264 B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
265 LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
266 }
267 }
268
269 // Replace the old instruction.
270 II->replaceAllUsesWith(VecOp);
271 VecOp->takeName(II);
272 InstrsToErase.push_back(II);
273 }
274
275 Modified = true;
276 }
277
 // Erasure is deferred until every list has been handled.
278 for (auto I : InstrsToErase)
279 I->eraseFromParent();
280
281 return Modified;
282}
283
// NOTE(review): the signature line of this function is not present in this
// listing; the symbol index at the end of the file gives it as
// `static bool imageIntrinsicOptimizerImpl(Function &F,
//  const TargetMachine *TM)`.
//
// Shared driver used by both pass entry points visible in this file. Returns
// false without touching F when no TargetMachine is available, when the
// subtarget check below fails, or when the module has no used declaration of
// either image load intrinsic. Otherwise processes every basic block section
// by section and returns whether anything was rewritten.
285 if (!TM)
286 return false;
287
288 // This optimization only applies to GFX11 and beyond.
 // Subtargets reporting hasMSAALoadDstSelBug() are excluded as well.
289 const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
290 if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
291 return false;
292
293 Module *M = F.getParent();
294
295 // Early test to determine if the intrinsics are used.
 // This scans the functions of the whole module, not just F.
296 if (llvm::none_of(*M, [](Function &F) {
297 return !F.users().empty() &&
298 (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
299 F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
300 }))
301 return false;
302
303 bool Modified = false;
304 for (auto &BB : F) {
305 BasicBlock::iterator SectionEnd;
 // Each iteration handles one section and resumes at the iterator that
 // collectMergeableInsts returned.
306 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
307 I = SectionEnd) {
 // NOTE(review): listing line 308 is not present in this listing;
 // MergeableInsts, used below, is presumably declared there.
309
310 SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
311 Modified |= optimizeSection(MergeableInsts);
312 }
313 }
314
315 return Modified;
316}
317
// FunctionPass entry point. Returns false immediately when skipFunction(F)
// says the function should be skipped.
// NOTE(review): listing line 322, the statement that follows the early
// return, is not present in this listing.
318bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
319 if (skipFunction(F))
320 return false;
321
323}
324
// NOTE(review): the signature lines of this function are not present in this
// listing; the symbol index at the end of the file gives it as
// `FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(
//  const TargetMachine *)`.
// Returns a newly allocated AMDGPUImageIntrinsicOptimizer constructed with TM.
327 return new AMDGPUImageIntrinsicOptimizer(TM);
328}
329
333
// NOTE(review): the signature lines of this function are not present in this
// listing; the symbol index at the end of the file gives it as
// `PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)`.
// Runs the shared implementation on F and reports no analyses as preserved if
// it changed anything, all analyses otherwise.
334 bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
335 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
336}
aarch64 promote const
bool optimizeSection(ArrayRef< SmallVector< IntrinsicInst *, 4 > > MergeableInsts)
INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE, "AMDGPU Image Intrinsic Optimizer", false, false) char AMDGPUImageIntrinsicOptimizer void addInstToMergeableList(IntrinsicInst *II, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr)
BasicBlock::iterator collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts)
static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
uint64_t IntrinsicInst * II
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:38
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Class for arbitrary precision integers.
Definition: APInt.h:78
APInt udiv(const APInt &RHS) const
Unsigned division operation.
Definition: APInt.cpp:1543
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:155
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:680
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2674
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: Analysis.h:114
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void push_back(const T &Elt)
Definition: SmallVector.h:427
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:343
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1833
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
bool isGFX11Plus(const MCSubtargetInfo &STI)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
bool getIntrinsicSignature(Intrinsic::ID, FunctionType *FT, SmallVectorImpl< Type * > &ArgTys)
Gets the type arguments of an intrinsic call by matching type contraints specified by the ....
Definition: Function.cpp:1839
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1539
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
FunctionPass * createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)