28#define DEBUG_TYPE "amdgpu-late-codegenprepare"
37 WidenLoads(
"amdgpu-late-codegenprepare-widen-constant-loads",
38 cl::desc(
"Widen sub-dword constant address space loads in "
39 "AMDGPULateCodeGenPrepare"),
44class AMDGPULateCodeGenPrepare
45 :
public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
58 :
Mod(&M),
DL(&M.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
63 bool isDWORDAligned(
const Value *V)
const {
68 bool canWidenScalarExtLoad(
LoadInst &LI)
const;
74class LiveRegOptimizer {
80 Type *ConvertToScalar;
91 Type *calculateConvertType(
Type *OriginalType);
109 bool shouldReplace(
Type *ITy) {
114 auto TLI =
ST->getTargetLowering();
130 DL = &
Mod->getDataLayout();
137bool AMDGPULateCodeGenPrepare::run(
Function &
F) {
145 LiveRegOptimizer LRO(
Mod, &ST);
147 bool Changed =
false;
149 bool HasScalarSubwordLoads =
ST.hasScalarSubwordLoads();
153 Changed |= !HasScalarSubwordLoads && visit(
I);
154 Changed |= LRO.optimizeLiveType(&
I, DeadInsts);
161Type *LiveRegOptimizer::calculateConvertType(
Type *OriginalType) {
167 TypeSize OriginalSize =
DL->getTypeSizeInBits(VTy);
168 TypeSize ConvertScalarSize =
DL->getTypeSizeInBits(ConvertToScalar);
169 unsigned ConvertEltCount =
170 (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
172 if (OriginalSize <= ConvertScalarSize)
176 ConvertEltCount,
false);
182 Type *NewTy = calculateConvertType(
V->getType());
184 TypeSize OriginalSize =
DL->getTypeSizeInBits(VTy);
185 TypeSize NewSize =
DL->getTypeSizeInBits(NewTy);
190 if (OriginalSize == NewSize)
191 return Builder.CreateBitCast(V, NewTy,
V->getName() +
".bc");
194 assert(NewSize > OriginalSize);
199 for (
unsigned I = 0;
I < OriginalElementCount;
I++)
202 for (
uint64_t I = OriginalElementCount;
I < ExpandedVecElementCount;
I++)
203 ShuffleMask.
push_back(OriginalElementCount);
205 Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
206 return Builder.CreateBitCast(ExpandedVec, NewTy,
V->getName() +
".bc");
214 TypeSize OriginalSize =
DL->getTypeSizeInBits(
V->getType());
215 TypeSize NewSize =
DL->getTypeSizeInBits(NewVTy);
219 if (OriginalSize == NewSize)
220 return Builder.CreateBitCast(V, NewVTy,
V->getName() +
".bc");
224 assert(OriginalSize > NewSize);
226 if (!
V->getType()->isVectorTy()) {
229 return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
238 cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
242 std::iota(ShuffleMask.
begin(), ShuffleMask.
end(), 0);
244 return Builder.CreateShuffleVector(Converted, ShuffleMask);
247bool LiveRegOptimizer::optimizeLiveType(
255 while (!Worklist.
empty()) {
261 if (!shouldReplace(
II->getType()))
264 if (
PHINode *Phi = dyn_cast<PHINode>(
II)) {
267 for (
Value *V :
Phi->incoming_values()) {
269 if (
PHINode *OpPhi = dyn_cast<PHINode>(V)) {
270 if (!PhiNodes.
count(OpPhi) && !Visited.
count(OpPhi))
277 if (!IncInst && !isa<ConstantAggregateZero>(V))
287 for (
User *V :
II->users()) {
289 if (
PHINode *OpPhi = dyn_cast<PHINode>(V)) {
290 if (!PhiNodes.
count(OpPhi) && !Visited.
count(OpPhi))
297 if (UseInst->
getParent() !=
II->getParent() || isa<PHINode>(
II)) {
298 Uses.insert(UseInst);
299 if (!Defs.
count(
II) && !isa<PHINode>(
II)) {
310 Value *ConvertVal = convertToOptType(
D, InsertPt);
312 ValMap[
D] = ConvertVal;
317 for (
PHINode *Phi : PhiNodes) {
319 Phi->getNumIncomingValues(),
320 Phi->getName() +
".tc",
Phi->getIterator());
324 for (
PHINode *Phi : PhiNodes) {
325 PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
326 bool MissingIncVal =
false;
327 for (
int I = 0, E =
Phi->getNumIncomingValues();
I < E;
I++) {
328 Value *IncVal =
Phi->getIncomingValue(
I);
329 if (isa<ConstantAggregateZero>(IncVal)) {
330 Type *NewType = calculateConvertType(
Phi->getType());
331 NewPhi->
addIncoming(ConstantInt::get(NewType, 0,
false),
332 Phi->getIncomingBlock(
I));
333 }
else if (ValMap.
contains(IncVal) && ValMap[IncVal])
336 MissingIncVal =
true;
345 while (!PHIWorklist.
empty()) {
347 VisitedPhis.
insert(NextDeadValue);
349 std::find_if(PhiNodes.begin(), PhiNodes.end(),
350 [
this, &NextDeadValue](
PHINode *CandPhi) {
351 return ValMap[CandPhi] == NextDeadValue;
355 if (OriginalPhi != PhiNodes.end())
356 ValMap.
erase(*OriginalPhi);
358 DeadInsts.
emplace_back(cast<Instruction>(NextDeadValue));
361 if (!VisitedPhis.
contains(cast<PHINode>(U)))
374 Value *NewVal =
nullptr;
375 if (BBUseValMap.
contains(
U->getParent()) &&
376 BBUseValMap[
U->getParent()].contains(ValMap[
Op]))
377 NewVal = BBUseValMap[
U->getParent()][ValMap[
Op]];
383 if (isa<Instruction>(
Op) && !isa<PHINode>(
Op) &&
384 U->getParent() == cast<Instruction>(
Op)->getParent()) {
388 convertFromOptType(
Op->getType(), cast<Instruction>(ValMap[
Op]),
389 InsertPt,
U->getParent());
390 BBUseValMap[
U->getParent()][ValMap[
Op]] = NewVal;
394 U->setOperand(OpIdx, NewVal);
402bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(
LoadInst &LI)
const {
415 unsigned TySize =
DL->getTypeStoreSize(Ty);
426bool AMDGPULateCodeGenPrepare::visitLoadInst(
LoadInst &LI) {
435 if (!canWidenScalarExtLoad(LI))
443 if (!isDWORDAligned(
Base))
446 int64_t Adjust =
Offset & 0x3;
457 unsigned LdBits =
DL->getTypeStoreSizeInBits(LI.
getType());
460 auto *NewPtr = IRB.CreateConstGEP1_64(
465 LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr,
Align(4));
467 NewLd->
setMetadata(LLVMContext::MD_range,
nullptr);
469 unsigned ShAmt = Adjust * 8;
470 auto *NewVal = IRB.CreateBitCast(
471 IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.
getType());
485 AMDGPULateCodeGenPrepare Impl(*
F.getParent(), ST, &AC, &UI);
487 bool Changed = Impl.run(
F);
503 return "AMDGPU IR late optimizations";
525 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
527 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
529 AMDGPULateCodeGenPrepare Impl(*
F.getParent(), ST, &AC, &UI);
535 "AMDGPU IR late optimizations",
false,
false)
aarch64 falkor hwpf fix late
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static bool runOnFunction(Function &F, bool PostInlining)
Rewrite Partial Register Uses
Legalize the Machine IR a function's Machine IR
Generic memory optimizations
uint64_t IntrinsicInst * II
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Target-Independent Code Generator Pass Configuration Options pass.
bool runOnFunction(Function &F) override
runOnFunction - Virtual method overridden by subclasses to do the per-function processing of the pass.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
AMDGPULateCodeGenPrepareLegacy()
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
InstListType::iterator iterator
Instruction iterators...
Represents analyses that only rely on functions' control flow.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
bool erase(const KeyT &Val)
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Class to represent fixed width SIMD vectors.
FunctionPass class - This class is used to implement most global optimizations.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Base class for instruction visitors.
void visitInstruction(Instruction &I)
RetTy visitLoadInst(LoadInst &I)
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
A Module instance is used to store all the information related to an LLVM module.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
void preserveSet()
Mark an analysis set as preserved.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
Target-Independent Code Generator Pass Configuration Options.
TMC & getTM() const
Get the right type of TargetMachine for this target.
The instances of the Type class are immutable: once they are created, they are never changed.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isAggregateType() const
Return true if the type is an aggregate type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVMContext & getContext() const
All values hold a context through their type.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
const ParentTy * getParent() const
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
NodeAddr< PhiNode * > Phi
This is an optimization pass for GlobalISel generic memory operations.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
auto reverse(ContainerTy &&C)
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
FunctionPass * createAMDGPULateCodeGenPrepareLegacyPass()
This struct is a compact representation of a valid (non-zero power of two) alignment.
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.