LLVM 19.0.0git
AMDGPULateCodeGenPrepare.cpp
Go to the documentation of this file.
//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
28#define DEBUG_TYPE "amdgpu-late-codegenprepare"
29
30using namespace llvm;
31
32// Scalar load widening needs running after load-store-vectorizer as that pass
33// doesn't handle overlapping cases. In addition, this pass enhances the
34// widening to handle cases where scalar sub-dword loads are naturally aligned
35// only but not dword aligned.
36static cl::opt<bool>
37 WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
38 cl::desc("Widen sub-dword constant address space loads in "
39 "AMDGPULateCodeGenPrepare"),
41
42namespace {
43
44class AMDGPULateCodeGenPrepare
45 : public FunctionPass,
46 public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
47 Module *Mod = nullptr;
48 const DataLayout *DL = nullptr;
49
50 AssumptionCache *AC = nullptr;
51 UniformityInfo *UA = nullptr;
52
53public:
54 static char ID;
55
56 AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}
57
58 StringRef getPassName() const override {
59 return "AMDGPU IR late optimizations";
60 }
61
62 void getAnalysisUsage(AnalysisUsage &AU) const override {
66 AU.setPreservesAll();
67 }
68
69 bool doInitialization(Module &M) override;
70 bool runOnFunction(Function &F) override;
71
72 bool visitInstruction(Instruction &) { return false; }
73
74 // Check if the specified value is at least DWORD aligned.
75 bool isDWORDAligned(const Value *V) const {
76 KnownBits Known = computeKnownBits(V, *DL, 0, AC);
77 return Known.countMinTrailingZeros() >= 2;
78 }
79
80 bool canWidenScalarExtLoad(LoadInst &LI) const;
81 bool visitLoadInst(LoadInst &LI);
82};
83
84} // end anonymous namespace
85
86bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
87 Mod = &M;
88 DL = &Mod->getDataLayout();
89 return false;
90}
91
92bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
93 if (skipFunction(F))
94 return false;
95
96 const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
97 const TargetMachine &TM = TPC.getTM<TargetMachine>();
98 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
99 if (ST.hasScalarSubwordLoads())
100 return false;
101
102 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
103 UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
104
105 bool Changed = false;
106 for (auto &BB : F)
108 Changed |= visit(I);
109
110 return Changed;
111}
112
113bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
114 unsigned AS = LI.getPointerAddressSpace();
115 // Skip non-constant address space.
116 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
118 return false;
119 // Skip non-simple loads.
120 if (!LI.isSimple())
121 return false;
122 auto *Ty = LI.getType();
123 // Skip aggregate types.
124 if (Ty->isAggregateType())
125 return false;
126 unsigned TySize = DL->getTypeStoreSize(Ty);
127 // Only handle sub-DWORD loads.
128 if (TySize >= 4)
129 return false;
130 // That load must be at least naturally aligned.
131 if (LI.getAlign() < DL->getABITypeAlign(Ty))
132 return false;
133 // It should be uniform, i.e. a scalar load.
134 return UA->isUniform(&LI);
135}
136
137bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
138 if (!WidenLoads)
139 return false;
140
141 // Skip if that load is already aligned on DWORD at least as it's handled in
142 // SDAG.
143 if (LI.getAlign() >= 4)
144 return false;
145
146 if (!canWidenScalarExtLoad(LI))
147 return false;
148
149 int64_t Offset = 0;
150 auto *Base =
152 // If that base is not DWORD aligned, it's not safe to perform the following
153 // transforms.
154 if (!isDWORDAligned(Base))
155 return false;
156
157 int64_t Adjust = Offset & 0x3;
158 if (Adjust == 0) {
159 // With a zero adjust, the original alignment could be promoted with a
160 // better one.
161 LI.setAlignment(Align(4));
162 return true;
163 }
164
165 IRBuilder<> IRB(&LI);
166 IRB.SetCurrentDebugLocation(LI.getDebugLoc());
167
168 unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
169 auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
170
171 auto *NewPtr = IRB.CreateConstGEP1_64(
172 IRB.getInt8Ty(),
173 IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
174 Offset - Adjust);
175
176 LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
177 NewLd->copyMetadata(LI);
178 NewLd->setMetadata(LLVMContext::MD_range, nullptr);
179
180 unsigned ShAmt = Adjust * 8;
181 auto *NewVal = IRB.CreateBitCast(
182 IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
183 LI.replaceAllUsesWith(NewVal);
185
186 return true;
187}
188
189INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
190 "AMDGPU IR late optimizations", false, false)
194INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
196
197char AMDGPULateCodeGenPrepare::ID = 0;
198
200 return new AMDGPULateCodeGenPrepare();
201}
aarch64 falkor hwpf fix late
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
#define DEBUG_TYPE
AMDGPU IR late optimizations
The AMDGPU TargetMachine interface definition for hw codegen targets.
Legalize the Machine IR of a function's Machine IR
Definition: Legalizer.cpp:81
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Module * Mod
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
Target-Independent Code Generator Pass Configuration Options pass.
LLVM IR instance of the generic uniformity analysis.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
bool isUniform(ConstValueRefT V) const
Whether V is uniform/non-divergent.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2644
Base class for instruction visitors.
Definition: InstVisitor.h:78
void visitInstruction(Instruction &I)
Definition: InstVisitor.h:280
RetTy visitLoadInst(LoadInst &I)
Definition: InstVisitor.h:169
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1636
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
void setAlignment(Align Align)
Definition: Instructions.h:240
Value * getPointerOperand()
Definition: Instructions.h:280
bool isSimple() const
Definition: Instructions.h:272
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:236
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:287
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
virtual bool doInitialization(Module &)
doInitialization - Virtual method overridden by subclasses to do any necessary initialization before ...
Definition: Pass.h:119
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
Target-Independent Code Generator Pass Configuration Options.
TMC & getTM() const
Get the right type of TargetMachine for this target.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Legacy analysis pass which computes a CycleInfo.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:456
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:533
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:665
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
FunctionPass * createAMDGPULateCodeGenPreparePass()
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:238