28#include "llvm/IR/IntrinsicsAMDGPU.h"
31#define GET_GICOMBINER_DEPS
32#include "AMDGPUGenPreLegalizeGICombiner.inc"
33#undef GET_GICOMBINER_DEPS
35#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
41#define GET_GICOMBINER_TYPES
42#include "AMDGPUGenPostLegalizeGICombiner.inc"
43#undef GET_GICOMBINER_TYPES
45class AMDGPUPostLegalizerCombinerImpl :
public Combiner {
47 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
54 AMDGPUPostLegalizerCombinerImpl(
57 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
61 static const char *
getName() {
return "AMDGPUPostLegalizerCombinerImpl"; }
66 struct FMinFMaxLegacyInfo {
74 FMinFMaxLegacyInfo &
Info)
const;
76 const FMinFMaxLegacyInfo &
Info)
const;
90 struct CvtF32UByteMatchInfo {
96 CvtF32UByteMatchInfo &MatchInfo)
const;
98 const CvtF32UByteMatchInfo &MatchInfo)
const;
104 bool matchCombineSignExtendInReg(
105 MachineInstr &
MI, std::pair<MachineInstr *, unsigned> &MatchInfo)
const;
106 void applyCombineSignExtendInReg(
107 MachineInstr &
MI, std::pair<MachineInstr *, unsigned> &MatchInfo)
const;
114 bool matchCombine_s_mul_u64(
MachineInstr &
MI,
unsigned &NewOpcode)
const;
117#define GET_GICOMBINER_CLASS_MEMBERS
118#define AMDGPUSubtarget GCNSubtarget
119#include "AMDGPUGenPostLegalizeGICombiner.inc"
120#undef GET_GICOMBINER_CLASS_MEMBERS
121#undef AMDGPUSubtarget
124#define GET_GICOMBINER_IMPL
125#define AMDGPUSubtarget GCNSubtarget
126#include "AMDGPUGenPostLegalizeGICombiner.inc"
127#undef AMDGPUSubtarget
128#undef GET_GICOMBINER_IMPL
130AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
133 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
135 :
Combiner(MF, CInfo, TPC, &VT, CSEInfo), RuleConfig(RuleConfig), STI(STI),
136 TII(*STI.getInstrInfo()),
137 Helper(Observer,
B,
false, &VT, MDT, LI, STI),
139#include
"AMDGPUGenPostLegalizeGICombiner.inc"
144bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(
MachineInstr &
MI)
const {
145 if (tryCombineAllImpl(
MI))
148 switch (
MI.getOpcode()) {
149 case TargetOpcode::G_SHL:
150 case TargetOpcode::G_LSHR:
151 case TargetOpcode::G_ASHR:
161bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
162 MachineInstr &
MI, MachineInstr &FCmp, FMinFMaxLegacyInfo &
Info)
const {
175 if ((
Info.LHS != True ||
Info.RHS != False) &&
176 (
Info.LHS != False ||
Info.RHS != True))
182 if (
Info.LHS != True)
189void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinFMaxLegacy(
190 MachineInstr &
MI,
const FMinFMaxLegacyInfo &
Info)
const {
192 : AMDGPU::G_AMDGPU_FMIN_LEGACY;
202 B.buildInstr(
Opc, {
MI.getOperand(0)}, {
X,
Y},
MI.getFlags());
204 MI.eraseFromParent();
207bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
208 MachineInstr &
MI)
const {
215 LLT Ty =
MRI.getType(DstReg);
218 unsigned SrcSize =
MRI.getType(SrcReg).getSizeInBits();
219 assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
227void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
228 MachineInstr &
MI)
const {
233 LLT Ty =
MRI.getType(DstReg);
234 LLT SrcTy =
MRI.getType(SrcReg);
236 SrcReg =
B.buildAnyExtOrTrunc(
S32, SrcReg).getReg(0);
239 B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
242 auto Cvt0 =
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {
S32}, {SrcReg},
244 B.buildFPTrunc(DstReg, Cvt0,
MI.getFlags());
247 MI.eraseFromParent();
250bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
252 std::function<
void(MachineIRBuilder &)> &MatchInfo)
const {
253 auto getRcpSrc = [=](
const MachineInstr &
MI) -> MachineInstr * {
258 if (GI->is(Intrinsic::amdgcn_rcp))
259 return MRI.getVRegDef(
MI.getOperand(2).getReg());
264 auto getSqrtSrc = [=](
const MachineInstr &
MI) -> MachineInstr * {
267 MachineInstr *SqrtSrcMI =
nullptr;
274 MachineInstr *RcpSrcMI =
nullptr, *SqrtSrcMI =
nullptr;
276 if ((RcpSrcMI = getRcpSrc(
MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
277 MatchInfo = [SqrtSrcMI, &
MI](MachineIRBuilder &
B) {
278 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
MI.getOperand(0)})
280 .setMIFlags(
MI.getFlags());
286 if ((SqrtSrcMI = getSqrtSrc(
MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
287 MatchInfo = [RcpSrcMI, &
MI](MachineIRBuilder &
B) {
288 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
MI.getOperand(0)})
290 .setMIFlags(
MI.getFlags());
297bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
298 MachineInstr &
MI)
const {
300 return MRI.hasOneNonDBGUse(Sqrt);
303void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
307 LLT DstTy =
MRI.getType(Dst);
308 uint32_t
Flags =
MI.getFlags();
309 Register RSQ =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
313 B.buildFMul(Dst, RSQ,
Y, Flags);
314 MI.eraseFromParent();
317bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
318 MachineInstr &
MI, CvtF32UByteMatchInfo &MatchInfo)
const {
328 const unsigned Offset =
MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
330 unsigned ShiftOffset = 8 *
Offset;
332 ShiftOffset += ShiftAmt;
334 ShiftOffset -= ShiftAmt;
336 MatchInfo.CvtVal = Src0;
337 MatchInfo.ShiftOffset = ShiftOffset;
338 return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
345void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
346 MachineInstr &
MI,
const CvtF32UByteMatchInfo &MatchInfo)
const {
347 unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
351 LLT SrcTy =
MRI.getType(MatchInfo.CvtVal);
354 CvtSrc =
B.buildAnyExt(
S32, CvtSrc).getReg(0);
358 B.buildInstr(NewOpc, {
MI.getOperand(0)}, {CvtSrc},
MI.getFlags());
359 MI.eraseFromParent();
362bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
364 const SITargetLowering *TLI =
static_cast<const SITargetLowering *
>(
365 MF.getSubtarget().getTargetLowering());
366 Reg =
MI.getOperand(1).getReg();
376bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
377 MachineInstr &
MI, std::pair<MachineInstr *, unsigned> &MatchData)
const {
379 if (!
MRI.hasOneNonDBGUse(LoadReg))
384 MachineInstr *LoadMI =
MRI.getVRegDef(LoadReg);
385 int64_t Width =
MI.getOperand(2).getImm();
387 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
388 MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
390 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
391 MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
393 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
394 MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};
396 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
397 MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};
405void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
406 MachineInstr &
MI, std::pair<MachineInstr *, unsigned> &MatchData)
const {
407 auto [LoadMI, NewOpcode] = MatchData;
411 Register SignExtendInsnDst =
MI.getOperand(0).getReg();
414 MI.eraseFromParent();
417bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
418 MachineInstr &
MI,
unsigned &NewOpcode)
const {
424 if (VT->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
425 VT->getKnownBits(Src0).countMinLeadingZeros() >= 32) {
426 NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;
430 if (VT->computeNumSignBits(Src1) >= 33 &&
431 VT->computeNumSignBits(Src0) >= 33) {
432 NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;
441class AMDGPUPostLegalizerCombiner :
public MachineFunctionPass {
445 AMDGPUPostLegalizerCombiner(
bool IsOptNone =
false);
447 StringRef getPassName()
const override {
448 return "AMDGPUPostLegalizerCombiner";
451 bool runOnMachineFunction(MachineFunction &MF)
override;
453 void getAnalysisUsage(AnalysisUsage &AU)
const override;
457 AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
461void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU)
const {
465 AU.
addRequired<GISelValueTrackingAnalysisLegacy>();
474AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(
bool IsOptNone)
475 : MachineFunctionPass(
ID), IsOptNone(IsOptNone) {
476 if (!RuleConfig.parseCommandLineOption())
480bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(
MachineFunction &MF) {
483 auto *TPC = &getAnalysis<TargetPassConfig>();
493 &getAnalysis<GISelValueTrackingAnalysisLegacy>().get(MF);
496 : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
499 LI, EnableOpt,
F.hasOptSize(),
F.hasMinSize());
501 CInfo.MaxIterations = 1;
504 CInfo.EnableFullDCE =
false;
505 AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *VT,
nullptr,
506 RuleConfig, ST, MDT, LI);
507 return Impl.combineMachineInstrs();
510char AMDGPUPostLegalizerCombiner::ID = 0;
512 "Combine AMDGPU machine instrs after legalization",
false,
517 "Combine AMDGPU machine instrs after legalization",
false,
521 return new AMDGPUPostLegalizerCombiner(IsOptNone);
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
#define GET_GICOMBINER_CONSTRUCTOR_INITS
This contains common combine transformations that may be used in a combine pass.
This file declares the targeting of the MachineLegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
This contains common combine transformations that may be used in a combine pass, or by the target else...
Option class for Targets to specify which operations are combined how and when.
This contains the base class for all Combiners generated by TableGen.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
Contains matchers for matching SSA Machine Instructions.
Promote Memory to Register
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
static StringRef getName(Value *V)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
Target-Independent Code Generator Pass Configuration Options pass.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass iff it does not:
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getUnorderedPredicate() const
GISelValueTracking * getValueTracking() const
bool tryCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftAmount) const
FunctionPass class - This class is used to implement most global optimizations.
To use KnownBitsInfo analysis in a pass, KnownBitsInfo &Info = getAnalysis<GISelValueTrackingInfoAnal...
bool maskedValueIsZero(Register Val, const APInt &Mask)
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineFunctionProperties & getProperties() const
Get the function properties.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
Wrapper class representing virtual and physical registers.
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Target-Independent Code Generator Pass Configuration Options.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
operand_type_match m_Reg()
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
ConstantMatch< APInt > m_ICst(APInt &Cst)
UnaryOp_match< SrcTy, TargetOpcode::G_FSQRT > m_GFSqrt(const SrcTy &Src)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
Predicate getPredicate(unsigned Condition, unsigned Hint)
Return predicate consisting of specified condition and hint bits.
This is an optimization pass for GlobalISel generic memory operations.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
LLVM_ABI void getSelectionDAGFallbackAnalysisUsage(AnalysisUsage &AU)
Modify analysis usage so it preserves passes required for the SelectionDAG fallback.
FunctionPass * createAMDGPUPostLegalizeCombiner(bool IsOptNone)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ SinglePass
Enables Observer-based DCE and additional heuristics that retry combining defined and used instructio...