#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;
};

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;
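// Sketch of the rewrite this pass performs (the IR below is illustrative;
// the exact overload suffixes depend on the element types involved). Four
// 2dmsaa loads that differ only in their fragment id:
//
//   %v0 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//   %v1 = ... same, but fragid = 1
//   %v2 = ... same, but fragid = 2
//   %v3 = ... same, but fragid = 3
//
// can be combined into one load of all four fragments of a single channel:
//
//   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//   %v0 = extractelement <4 x float> %v, i64 0
//   %v1 = extractelement <4 x float> %v, i64 1
//   %v2 = extractelement <4 x float> %v, i64 2
//   %v3 = extractelement <4 x float> %v, i64 3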
void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim.
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check all arguments (DMask, VAddr, RSrc etc).
    bool AllEqual = true;
    assert(IIList.front()->arg_size() == II->arg_size());
    for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
      Value *ArgList = IIList.front()->getArgOperand(I);
      Value *Arg = II->getArgOperand(I);
      if (I == ImageDimIntr->VAddrEnd - 1) {
        // FragIds only need to fall into the same group of four.
        auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
        auto FragId = cast<ConstantInt>(II->getArgOperand(I));
        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
      } else {
        // Every other argument must match exactly.
        AllEqual = ArgList == Arg;
      }
    }
    if (!AllEqual)
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // No mergeable list found; start a new one.
  MergeableInsts.emplace_back(1, II);
}
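// Worked example of the FragId grouping above (illustrative values): udiv(4)
// buckets fragment ids 0..3 together and 4..7 together, so a load with
// fragid 2 can join a list whose first load has fragid 0, while a load with
// fragid 5 cannot:
//
//   APInt(32, 0).udiv(4) == 0 == APInt(32, 2).udiv(4)   // same group
//   APInt(32, 5).udiv(4) == 1 != APInt(32, 2).udiv(4)   // different groups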
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // Don't merge loads across stores or memory barriers.
    if (I->mayHaveSideEffects()) {
      ++I;
      break;
    }

    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // Only loads with a constant FragId are candidates.
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }
  return I;
}
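// Example of the sectioning behaviour above (illustrative IR): given a block
//
//   %a = call ... @llvm.amdgcn.image.load.2dmsaa...   ; fragid 0
//   store i32 0, ptr %p
//   %b = call ... @llvm.amdgcn.image.load.2dmsaa...   ; fragid 1
//
// collectMergeableInsts stops at the store, so %a and %b land in separate
// sections and are never merged across the side-effecting instruction.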
bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and override them later if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate the signature, extracting the overloaded types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    // The new intrinsic returns all four fragments of one channel.
    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // Both the number of instructions and the number of vaddr/vdata dword
    // transfers should be reduced by the combine.
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

    // Rebase the FragId to the start of its group of four.
    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;
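    // Worked example of the tradeoff check (illustrative numbers): merging
    // four f32 loads that share dmask = 0x1 costs
    //   loads: NumVAddrLoads + NumVDataLoads = 4*3 + 4*1 = 16 dwords
    //   msaa:  NumVAddrMsaas + NumVDataMsaas = 1*3 + 1*4 =  7 dwords
    // so the combine is a clear win. For d16 data the vdata terms are halved
    // (rounded up), which is what divideCeil(..., isD16 ? 2 : 1) accounts for.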
    // Create one new image_msaa_load per set dmask bit.
    IRBuilder<> B(IIList.front());
    SmallVector<Instruction *, 4> NewCalls;
    while (DMaskVal != 0) {
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Function *NewIntrin = Intrinsic::getDeclaration(
          IIList.front()->getModule(), NewIntrinID, OverloadTys);
      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateCall(NewIntrin, Args);

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }
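    // Example of the dmask peeling above (illustrative): DMaskVal = 0b1101
    // produces three msaa_load calls with NewMaskVal = 0x1, 0x4, and 0x8 in
    // turn; each call returns all four fragments of one enabled channel.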
    // Rebuild each original result from the new calls.
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
      } else {
        VecOp = UndefValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }
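  // Lane selection, worked through (illustrative): a load with fragid 6 sits
  // in group 4..7, so NewFragIdVal is 4 and its value lives in lane
  // urem(6, 4) = 2 of every new call; with dmask = 0b0011 the loop above
  // gathers lane 2 of NewCalls[0] and NewCalls[1] into a two-element vector.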
  for (auto I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}
static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  // ... GFX11+ subtarget checks elided: the combine is only legal where
  // AMDGPU::isGFX11Plus(...) holds ...

  // Early exit if nothing in the module uses the mergeable intrinsics.
  Module *M = F.getParent();
  if (llvm::none_of(*M, [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  // ... per-basic-block driver loop elided: it repeatedly calls
  // collectMergeableInsts and then optimizeSection on each section ...
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;
  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}
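// Usage sketch (assumption: the new-pass-manager registration reuses the
// DEBUG_TYPE string, as is conventional for AMDGPU passes), running the pass
// in isolation through opt:
//
//   opt -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-image-intrinsic-opt \
//       -S in.ll -o out.ll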