#define DEBUG_TYPE "amdgpu-late-codegenprepare"

static cl::opt<bool> WidenLoads(
    "amdgpu-late-codegenprepare-widen-constant-loads",
    cl::desc("Widen sub-dword constant address space loads in "
             "AMDGPULateCodeGenPrepare"),
    cl::ReallyHidden, cl::init(true));
class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }
  bool visitLoadInst(LoadInst &LI);
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }
  bool canWidenScalarExtLoad(LoadInst &LI) const;
};
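isDWORDAligned asks computeKnownBits whether the pointer's two lowest address bits are provably zero, which is exactly what DWORD (4-byte) alignment means. A minimal standalone sketch of the same predicate on a plain integer address (a hypothetical helper, not part of the pass):

#include <cstdint>

// True when Addr is a multiple of 4, i.e. its two low bits are clear.
// countMinTrailingZeros() >= 2 states the same property for a symbolic
// pointer whose exact value is not known at compile time.
bool isDWORDAlignedAddr(uint64_t Addr) { return (Addr & 0x3) == 0; }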
bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}
bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (Instruction &I : make_early_inc_range(BB))
      Changed |= visit(I);
  return Changed;
}
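make_early_inc_range increments its iterator before handing out each instruction, so visitLoadInst below can replace and erase the instruction it is visiting without invalidating the traversal.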
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Only loads from the constant address spaces are safe to widen.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip aggregate types.
  Type *Ty = LI.getType();
  if (Ty->isAggregateType())
    return false;
  // Only sub-DWORD (fewer than 4 byte) loads need widening.
  unsigned TySize = DL->getTypeStoreSize(Ty);
  if (TySize >= 4)
    return false;
  // The rewrite produces a scalar load, so the original must be uniform.
  return DA->isUniform(&LI);
}
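For example, a uniform i8 or i16 load from the constant address space qualifies, while an i32 load (already a full DWORD), a load of a struct type, or a divergent per-lane load does not.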
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip loads already aligned on DWORD; they are handled well in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, rebasing the load on it is not safe.
  if (!isDWORDAligned(Base))
    return false;

  // Split the constant offset into a DWORD-aligned part plus the byte
  // position (0 to 3) of the loaded value within that DWORD.
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // The load is DWORD aligned after all; just promote its alignment.
    LI.setAlignment(Align(4));
    return true;
  }
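  // Worked example (illustrative values): with Offset = 6, Adjust = 6 & 3 = 2,
  // so the widened i32 load built below reads from Base + 4 and the original
  // value occupies bits [16, 16 + LdBits) of that DWORD.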
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto *IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  // Rebase the pointer on the DWORD-aligned address and load a full i32.
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  // Carry over metadata, but drop any range metadata: it described the
  // narrow value and does not hold for the widened i32.
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);
  // Shift the loaded DWORD right so the original bytes land at bit 0, then
  // truncate to the original width and bitcast back to the original type.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);
  return true;
}
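The shift-and-truncate sequence is ordinary integer arithmetic. The following self-contained sketch (hypothetical values, assuming the little-endian byte order AMDGPU uses) computes the same result the rewritten IR produces:

#include <cassert>
#include <cstdint>

// Mirror of the CreateLShr + CreateTrunc pair above: extract the NBytes-wide
// value sitting Adjust bytes into the widened 32-bit load. NBytes < 4 here,
// so the mask computation never overflows.
uint32_t extractNarrow(uint32_t Wide, unsigned Adjust, unsigned NBytes) {
  unsigned ShAmt = Adjust * 8;              // byte offset -> bit offset
  uint32_t Mask = (1u << (NBytes * 8)) - 1; // what the truncation keeps
  return (Wide >> ShAmt) & Mask;
}

int main() {
  // A DWORD whose bytes, at increasing addresses, are 11 22 33 44 (hex).
  uint32_t Wide = 0x44332211;
  // An i16 load at byte offset 2 within that DWORD sees bytes 33 44: 0x4433.
  assert(extractNarrow(Wide, /*Adjust=*/2, /*NBytes=*/2) == 0x4433);
  return 0;
}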
185 "AMDGPU IR late optimizations",
false,
false)
char AMDGPULateCodeGenPrepare::ID = 0;
FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}
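A sketch of how such a factory is typically consumed; the driver below is illustrative (it assumes the legacy pass manager and that the AMDGPU backend is linked and initialized), not code from the backend, which schedules this pass itself:

#include "llvm/IR/LegacyPassManager.h"

// Hypothetical driver: run the pass over a module M targeting amdgcn. The
// legacy pass manager instantiates the required analyses automatically.
void runLatePrepare(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createAMDGPULateCodeGenPreparePass());
  PM.run(M);
}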