28#define DEBUG_TYPE "amdgpu-late-codegenprepare"
37 WidenLoads(
"amdgpu-late-codegenprepare-widen-constant-loads",
38 cl::desc(
"Widen sub-dword constant address space loads in "
39 "AMDGPULateCodeGenPrepare"),
44class AMDGPULateCodeGenPrepare
46 public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
61 return "AMDGPU IR late optimizations";
77 bool isDWORDAligned(
const Value *V)
const {
82 bool canWidenScalarExtLoad(
LoadInst &LI)
const;
88class LiveRegOptimizer {
94 Type *ConvertToScalar;
105 Type *calculateConvertType(
Type *OriginalType);
123 bool shouldReplace(
Type *ITy) {
128 auto TLI =
ST->getTargetLowering();
151bool AMDGPULateCodeGenPrepare::doInitialization(
Module &M) {
157bool AMDGPULateCodeGenPrepare::runOnFunction(
Function &
F) {
165 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
166 UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
175 LiveRegOptimizer LRO(
Mod, &ST);
177 bool Changed =
false;
179 bool HasScalarSubwordLoads =
ST.hasScalarSubwordLoads();
183 Changed |= !HasScalarSubwordLoads && visit(
I);
184 Changed |= LRO.optimizeLiveType(&
I, DeadInsts);
191Type *LiveRegOptimizer::calculateConvertType(
Type *OriginalType) {
197 TypeSize OriginalSize =
DL->getTypeSizeInBits(VTy);
198 TypeSize ConvertScalarSize =
DL->getTypeSizeInBits(ConvertToScalar);
199 unsigned ConvertEltCount =
200 (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
202 if (OriginalSize <= ConvertScalarSize)
206 ConvertEltCount,
false);
212 Type *NewTy = calculateConvertType(
V->getType());
214 TypeSize OriginalSize =
DL->getTypeSizeInBits(VTy);
215 TypeSize NewSize =
DL->getTypeSizeInBits(NewTy);
220 if (OriginalSize == NewSize)
221 return Builder.CreateBitCast(V, NewTy,
V->getName() +
".bc");
224 assert(NewSize > OriginalSize);
229 for (
unsigned I = 0;
I < OriginalElementCount;
I++)
232 for (
uint64_t I = OriginalElementCount;
I < ExpandedVecElementCount;
I++)
233 ShuffleMask.
push_back(OriginalElementCount);
235 Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
236 return Builder.CreateBitCast(ExpandedVec, NewTy,
V->getName() +
".bc");
244 TypeSize OriginalSize =
DL->getTypeSizeInBits(
V->getType());
245 TypeSize NewSize =
DL->getTypeSizeInBits(NewVTy);
249 if (OriginalSize == NewSize)
250 return Builder.CreateBitCast(V, NewVTy,
V->getName() +
".bc");
254 assert(OriginalSize > NewSize);
256 if (!
V->getType()->isVectorTy()) {
259 return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
268 cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
272 std::iota(ShuffleMask.
begin(), ShuffleMask.
end(), 0);
274 return Builder.CreateShuffleVector(Converted, ShuffleMask);
277bool LiveRegOptimizer::optimizeLiveType(
285 while (!Worklist.
empty()) {
291 if (!shouldReplace(
II->getType()))
294 if (
PHINode *Phi = dyn_cast<PHINode>(
II)) {
297 for (
Value *V :
Phi->incoming_values()) {
299 if (
PHINode *OpPhi = dyn_cast<PHINode>(V)) {
300 if (!PhiNodes.
count(OpPhi) && !Visited.
count(OpPhi))
307 if (!IncInst && !isa<ConstantAggregateZero>(V))
317 for (
User *V :
II->users()) {
319 if (
PHINode *OpPhi = dyn_cast<PHINode>(V)) {
320 if (!PhiNodes.
count(OpPhi) && !Visited.
count(OpPhi))
327 if (UseInst->
getParent() !=
II->getParent() || isa<PHINode>(
II)) {
328 Uses.insert(UseInst);
329 if (!Defs.
count(
II) && !isa<PHINode>(
II)) {
340 Value *ConvertVal = convertToOptType(
D, InsertPt);
342 ValMap[
D] = ConvertVal;
347 for (
PHINode *Phi : PhiNodes) {
349 Phi->getNumIncomingValues(),
350 Phi->getName() +
".tc",
Phi->getIterator());
354 for (
PHINode *Phi : PhiNodes) {
355 PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
356 bool MissingIncVal =
false;
357 for (
int I = 0, E =
Phi->getNumIncomingValues();
I < E;
I++) {
358 Value *IncVal =
Phi->getIncomingValue(
I);
359 if (isa<ConstantAggregateZero>(IncVal)) {
360 Type *NewType = calculateConvertType(
Phi->getType());
361 NewPhi->
addIncoming(ConstantInt::get(NewType, 0,
false),
362 Phi->getIncomingBlock(
I));
366 MissingIncVal =
true;
370 DeadInst = cast<Instruction>(ValMap[Phi]);
381 Value *NewVal =
nullptr;
382 if (BBUseValMap.
contains(
U->getParent()) &&
383 BBUseValMap[
U->getParent()].contains(ValMap[
Op]))
384 NewVal = BBUseValMap[
U->getParent()][ValMap[
Op]];
388 convertFromOptType(
Op->getType(), cast<Instruction>(ValMap[
Op]),
389 InsertPt,
U->getParent());
390 BBUseValMap[
U->getParent()][ValMap[
Op]] = NewVal;
393 U->setOperand(OpIdx, NewVal);
401bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(
LoadInst &LI)
const {
414 unsigned TySize =
DL->getTypeStoreSize(Ty);
425bool AMDGPULateCodeGenPrepare::visitLoadInst(
LoadInst &LI) {
434 if (!canWidenScalarExtLoad(LI))
442 if (!isDWORDAligned(
Base))
445 int64_t Adjust =
Offset & 0x3;
456 unsigned LdBits =
DL->getTypeStoreSizeInBits(LI.
getType());
459 auto *NewPtr = IRB.CreateConstGEP1_64(
464 LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr,
Align(4));
466 NewLd->
setMetadata(LLVMContext::MD_range,
nullptr);
468 unsigned ShAmt = Adjust * 8;
469 auto *NewVal = IRB.CreateBitCast(
470 IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.
getType());
478 "AMDGPU IR late optimizations",
false,
false)
485char AMDGPULateCodeGenPrepare::
ID = 0;
488 return new AMDGPULateCodeGenPrepare();
aarch64 falkor hwpf fix late
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
AMDGPU IR late optimizations
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Rewrite Partial Register Uses
Legalize the Machine IR a function's Machine IR
uint64_t IntrinsicInst * II
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Target-Independent Code Generator Pass Configuration Options pass.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
InstListType::iterator iterator
Instruction iterators...
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Class to represent fixed width SIMD vectors.
FunctionPass class - This class is used to implement most global optimizations.
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overridden by subclasses to do the per-function processing of the pass.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Base class for instruction visitors.
void visitInstruction(Instruction &I)
RetTy visitLoadInst(LoadInst &I)
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
A Module instance is used to store all the information related to an LLVM module.
LLVMContext & getContext() const
Get the global data context.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
virtual bool doInitialization(Module &)
doInitialization - Virtual method overridden by subclasses to do any necessary initialization before ...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Target-Independent Code Generator Pass Configuration Options.
TMC & getTM() const
Get the right type of TargetMachine for this target.
The instances of the Type class are immutable: once they are created, they are never changed.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isAggregateType() const
Return true if the type is an aggregate type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
const ParentTy * getParent() const
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
initializer< Ty > init(const Ty &Val)
NodeAddr< PhiNode * > Phi
This is an optimization pass for GlobalISel generic memory operations.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
auto reverse(ContainerTy &&C)
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
FunctionPass * createAMDGPULateCodeGenPreparePass()
This struct is a compact representation of a valid (non-zero power of two) alignment.
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.