#define DEBUG_TYPE "amdgpu-late-codegenprepare"
static cl::opt<bool> WidenLoads(
    "amdgpu-late-codegenprepare-widen-constant-loads",
    cl::desc("Widen sub-dword constant address space loads in "
             "AMDGPULateCodeGenPrepare"),
    cl::ReallyHidden, cl::init(true));
namespace {

class AMDGPULateCodeGenPrepare
    : public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Function &F;
  const DataLayout &DL;
  const GCNSubtarget &ST;

  AssumptionCache *const AC;
  UniformityInfo &UA;

  SmallVector<WeakTrackingVH, 8> DeadInsts;

public:
  AMDGPULateCodeGenPrepare(Function &F, const GCNSubtarget &ST,
                           AssumptionCache *AC, UniformityInfo &UA)
      : F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}

  bool run();

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;

  bool visitLoadInst(LoadInst &LI);
};
class LiveRegOptimizer {
private:
  Module &Mod;
  const DataLayout &DL;
  const GCNSubtarget &ST;
  /// The scalar type to convert to.
  Type *const ConvertToScalar;
  // ... (visited-instruction set and value/use maps elided)

public:
  Type *calculateConvertType(Type *OriginalType);
  // ... (conversion and optimizeLiveType declarations elided; the
  // definitions follow below)

  // Whether or not the type should be replaced to avoid inefficient
  // legalization code.
  bool shouldReplace(Type *ITy) {
    FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
    if (!VTy)
      return false;

    const auto *TLI = ST.getTargetLowering();

    Type *EltTy = VTy->getElementType();
    // If the element size is not less than the convert-to scalar size, no bit
    // packing is possible.
    if (!EltTy->isIntegerTy() ||
        EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
      return false;

    // Only coerce illegal types.
    TargetLoweringBase::LegalizeKind LK =
        TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
    return LK.first != TargetLoweringBase::TypeLegal;
  }

  LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
      : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
        ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
};

} // end anonymous namespace
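// Illustrative behavior of shouldReplace (hypothetical calls, not in the
// original file), assuming the 32-bit ConvertToScalar set up above:
//   shouldReplace(<4 x i8>)  -> true:  i8 is not a legal AMDGPU scalar type
//                                      and its lanes pack into an i32
//   shouldReplace(<2 x i32>) -> false: i32 is already legal, nothing to gain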
bool AMDGPULateCodeGenPrepare::run() {
  // "Optimize" the virtual registers that cross basic block boundaries:
  // vectors of illegal types would otherwise be scalarized and widened when
  // building the SelectionDAG, with each scalar living in its own register.
  // Convert them to equivalent vectors of legal type (converted back before
  // uses in subsequent blocks) to pack the bits into fewer registers.
  LiveRegOptimizer LRO(*F.getParent(), ST);

  bool Changed = false;

  bool HasScalarSubwordLoads = ST.hasScalarSubwordLoads();

  for (auto &BB : reverse(F))
    for (Instruction &I : make_early_inc_range(reverse(BB))) {
      Changed |= !HasScalarSubwordLoads && visit(I);
      Changed |= LRO.optimizeLiveType(&I, DeadInsts);
    }

  RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
  return Changed;
}
Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
  assert(OriginalType->getScalarSizeInBits() <=
         ConvertToScalar->getScalarSizeInBits());

  FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);

  TypeSize OriginalSize = DL.getTypeSizeInBits(VTy);
  TypeSize ConvertScalarSize = DL.getTypeSizeInBits(ConvertToScalar);
  unsigned ConvertEltCount =
      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;

  if (OriginalSize <= ConvertScalarSize)
    return IntegerType::get(Mod.getContext(), ConvertScalarSize);

  return VectorType::get(Type::getIntNTy(Mod.getContext(), ConvertScalarSize),
                         ConvertEltCount, false);
}
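// A worked example of the rounding-up arithmetic above (added for
// illustration; the vector types are assumptions, not cases from this file):
//   <3 x i16>: OriginalSize = 48, ConvertEltCount = (48 + 32 - 1) / 32 = 2,
//              so the convert type is <2 x i32>;
//   <4 x i8> : OriginalSize = 32 <= ConvertScalarSize, so a plain i32.
static_assert((48 + 32 - 1) / 32 == 2, "v3i16 packs into two i32 lanes");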
Value *LiveRegOptimizer::convertToOptType(Instruction *V,
                                          BasicBlock::iterator &InsertPt) {
  FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
  Type *NewTy = calculateConvertType(V->getType());

  TypeSize OriginalSize = DL.getTypeSizeInBits(VTy);
  TypeSize NewSize = DL.getTypeSizeInBits(NewTy);

  IRBuilder<> Builder(V->getParent(), InsertPt);
  // If there is a bitsize match, we can fit the old vector into a new vector
  // of the desired type.
  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");

  // If there is a bitsize mismatch, we must use a wider vector.
  assert(NewSize > OriginalSize);
  uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();

  SmallVector<int, 8> ShuffleMask;
  uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
  for (unsigned I = 0; I < OriginalElementCount; I++)
    ShuffleMask.push_back(I);

  // Pad the remaining lanes with a repeated (poison) element index.
  for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
    ShuffleMask.push_back(OriginalElementCount);

  Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
  return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
}
Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
                                            BasicBlock::iterator &InsertPt,
                                            BasicBlock *InsertBB) {
  FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);

  TypeSize OriginalSize = DL.getTypeSizeInBits(V->getType());
  TypeSize NewSize = DL.getTypeSizeInBits(NewVTy);

  IRBuilder<> Builder(InsertBB, InsertPt);
  // If there is a bitsize match, we simply convert back to the original type.
  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");

  // If there is a bitsize mismatch, then we must have used a wider value to
  // hold the bits.
  assert(OriginalSize > NewSize);
  // For wide scalars, we can just truncate the value.
  if (!V->getType()->isVectorTy()) {
    Instruction *Trunc = cast<Instruction>(
        Builder.CreateTrunc(V, IntegerType::get(Mod.getContext(), NewSize)));
    return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
  }

  // For wider vectors, strip the MSBs to convert back to the original type.
  VectorType *ExpandedVT = VectorType::get(
      Type::getIntNTy(Mod.getContext(), NewVTy->getScalarSizeInBits()),
      (OriginalSize / NewVTy->getScalarSizeInBits()), false);
  Instruction *Converted =
      cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));

  unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
  SmallVector<int, 8> ShuffleMask(NarrowElementCount);
  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);

  return Builder.CreateShuffleVector(Converted, ShuffleMask);
}
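// Illustrative IR for the vector narrowing path above (again a hypothetical
// example): converting <2 x i32> back to an original <3 x i16> reinterprets
// the bits as <4 x i16> and keeps only the low three lanes:
//   %e = bitcast <2 x i32> %opt to <4 x i16>
//   %n = shufflevector <4 x i16> %e, <4 x i16> poison,
//                      <3 x i32> <i32 0, i32 1, i32 2>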
bool LiveRegOptimizer::optimizeLiveType(
    Instruction *I, SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  SmallVector<Instruction *, 4> Worklist;
  SmallPtrSet<PHINode *, 4> PhiNodes;
  SmallPtrSet<Instruction *, 4> Defs;
  SmallPtrSet<Instruction *, 4> Uses;

  Worklist.push_back(cast<Instruction>(I));
  while (!Worklist.empty()) {
    Instruction *II = Worklist.pop_back_val();

    if (!Visited.insert(II).second)
      continue;

    if (!shouldReplace(II->getType()))
      continue;

    if (PHINode *Phi = dyn_cast<PHINode>(II)) {
      PhiNodes.insert(Phi);
      // Collect all the incoming values of problematic PHI nodes.
      for (Value *V : Phi->incoming_values()) {
        // Repeat the collection process for newly found PHI nodes.
        if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
          if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
            Worklist.push_back(OpPhi);
          continue;
        }

        Instruction *IncInst = dyn_cast<Instruction>(V);
        // Other incoming value types (e.g. vector literals) are unhandled.
        if (!IncInst && !isa<ConstantAggregateZero>(V))
          return false;

        // Collect all other incoming values for coercion.
        if (IncInst)
          Defs.insert(IncInst);
      }
    }

    // Collect all relevant uses.
    for (User *V : II->users()) {
      // Repeat the collection process for newly found PHI nodes.
      if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
        if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
          Worklist.push_back(OpPhi);
        continue;
      }

      Instruction *UseInst = cast<Instruction>(V);
      // Collect all uses of PHI nodes and any use that crosses BB boundaries.
      if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
        Uses.insert(UseInst);
        if (!isa<PHINode>(II))
          Defs.insert(II);
      }
    }
  }

  // Coerce and track the defs.
  for (Instruction *D : Defs) {
    if (!ValMap.contains(D)) {
      BasicBlock::iterator InsertPt = std::next(D->getIterator());
      Value *ConvertVal = convertToOptType(D, InsertPt);
      assert(ConvertVal);
      ValMap[D] = ConvertVal;
    }
  }
  // Construct new-typed PHI nodes.
  for (PHINode *Phi : PhiNodes) {
    ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
                                  Phi->getNumIncomingValues(),
                                  Phi->getName() + ".tc", Phi->getIterator());
  }

  // Connect all the PHI nodes with their new incoming values.
  for (PHINode *Phi : PhiNodes) {
    PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
    bool MissingIncVal = false;
    for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
      Value *IncVal = Phi->getIncomingValue(I);
      if (isa<ConstantAggregateZero>(IncVal)) {
        Type *NewType = calculateConvertType(Phi->getType());
        NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
                            Phi->getIncomingBlock(I));
      } else if (Value *Val = ValMap.lookup(IncVal))
        NewPhi->addIncoming(Val, Phi->getIncomingBlock(I));
      else
        MissingIncVal = true;
    }
    if (MissingIncVal) {
      Value *DeadVal = ValMap[Phi];
      // The coercion chain of the PHI is broken. Delete the Phi from the
      // ValMap and any connected / user Phis.
      SmallVector<Value *, 4> PHIWorklist;
      SmallPtrSet<Value *, 4> VisitedPhis;
      PHIWorklist.push_back(DeadVal);
      while (!PHIWorklist.empty()) {
        Value *NextDeadValue = PHIWorklist.pop_back_val();
        VisitedPhis.insert(NextDeadValue);
        auto OriginalPhi =
            std::find_if(PhiNodes.begin(), PhiNodes.end(),
                         [this, &NextDeadValue](PHINode *CandPhi) {
                           return ValMap[CandPhi] == NextDeadValue;
                         });
        // This PHI may have already been removed from the maps when unwinding
        // a previous Phi.
        if (OriginalPhi != PhiNodes.end())
          ValMap.erase(*OriginalPhi);

        DeadInsts.emplace_back(cast<Instruction>(NextDeadValue));

        for (User *U : NextDeadValue->users()) {
          if (!VisitedPhis.contains(cast<PHINode>(U)))
            PHIWorklist.push_back(U);
        }
      }
    } else {
      DeadInsts.emplace_back(cast<Instruction>(Phi));
    }
  }
  // Coerce back to the original type and replace the uses.
  for (Instruction *U : Uses) {
    // Replace all converted operands for a use.
    for (auto [OpIdx, Op] : enumerate(U->operands())) {
      if (ValMap.contains(Op) && ValMap[Op]) {
        Value *NewVal = nullptr;
        if (BBUseValMap.contains(U->getParent()) &&
            BBUseValMap[U->getParent()].contains(ValMap[Op]))
          NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
        else {
          BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
          // We may pick up ops that were previously converted for users in
          // other blocks. If there is an originally typed definition of the
          // Op already in this block, simply reuse it.
          if (isa<Instruction>(Op) && !isa<PHINode>(Op) &&
              U->getParent() == cast<Instruction>(Op)->getParent()) {
            NewVal = Op;
          } else {
            NewVal = convertFromOptType(Op->getType(),
                                        cast<Instruction>(ValMap[Op]),
                                        InsertPt, U->getParent());
            BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
          }
        }
        assert(NewVal);
        U->setOperand(OpIdx, NewVal);
      }
    }
  }

  return true;
}
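// End-to-end sketch of the coercion (hypothetical block and value names): a
// <3 x i16> def in %bb0 used by a PHI in %bb2 is packed into <2 x i32> for
// the block crossing and unpacked at the using block's first non-PHI point:
//   bb0:  %def.bc   = ... convertToOptType(%def) ...            ; <2 x i32>
//   bb2:  %phi.tc   = phi <2 x i32> [ %def.bc, %bb0 ], [ %o.bc, %bb1 ]
//         %phi.orig = ... convertFromOptType(%phi.tc) ...        ; <3 x i16>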
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces and non-simple loads.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  if (!LI.isSimple())
    return false;
  Type *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL.getTypeStoreSize(Ty);
  // Only widen sub-DWORD, naturally aligned, uniform (i.e. scalar) loads.
  if (TySize >= 4)
    return false;
  if (LI.getAlign() < DL.getABITypeAlign(Ty))
    return false;
  return UA.isUniform(&LI);
}
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if that load is already aligned on DWORD at least as it's handled in
  // SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned LdBits = DL.getTypeStoreSizeInBits(LI.getType());
  auto *IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  DeadInsts.emplace_back(&LI);

  return true;
}
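// Worked example of the widening above (hypothetical values): a uniform
// `load i8` at constant offset 6 from a DWORD-aligned base gives
// Adjust = 6 & 0x3 = 2, so the load is rewritten roughly as:
//   %wide = load i32, ptr addrspace(4) %gep, align 4   ; %gep = base + 4
//   %shr  = lshr i32 %wide, 16                         ; ShAmt = Adjust * 8
//   %val  = trunc i32 %shr to i8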
PreservedAnalyses
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
  UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);

  bool Changed = AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();

  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA = PreservedAnalyses::none();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

class AMDGPULateCodeGenPrepareLegacy : public FunctionPass {
public:
  static char ID;

  AMDGPULateCodeGenPrepareLegacy() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  // ... (getAnalysisUsage and other pass boilerplate elided)

  bool runOnFunction(Function &F) override;
};

bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UniformityInfo &UI =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  return AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)