28#include "llvm/IR/IntrinsicsAMDGPU.h"
31#define GET_GICOMBINER_DEPS
32#include "AMDGPUGenPreLegalizeGICombiner.inc"
33#undef GET_GICOMBINER_DEPS
35#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
38using namespace MIPatternMatch;
41#define GET_GICOMBINER_TYPES
42#include "AMDGPUGenPostLegalizeGICombiner.inc"
43#undef GET_GICOMBINER_TYPES
45class AMDGPUPostLegalizerCombinerImpl :
public Combiner {
47 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
54 AMDGPUPostLegalizerCombinerImpl(
57 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
/// Returns the constant string "AMDGPUPostLegalizerCombinerImpl", which is
/// the name of the enclosing class.
61 static const char *
getName() {
return "AMDGPUPostLegalizerCombinerImpl"; }
66 struct FMinFMaxLegacyInfo {
77 const FMinFMaxLegacyInfo &
Info)
const;
91 struct CvtF32UByteMatchInfo {
97 CvtF32UByteMatchInfo &MatchInfo)
const;
99 const CvtF32UByteMatchInfo &MatchInfo)
const;
105 bool matchCombineSignExtendInReg(
106 MachineInstr &
MI, std::pair<MachineInstr *, unsigned> &MatchInfo)
const;
107 void applyCombineSignExtendInReg(
108 MachineInstr &
MI, std::pair<MachineInstr *, unsigned> &MatchInfo)
const;
112 bool matchCombine_s_mul_u64(
MachineInstr &
MI,
unsigned &NewOpcode)
const;
116 void applyCombine_s_mul_u64(
MachineInstr &
MI,
unsigned &NewOpcode)
const;
119#define GET_GICOMBINER_CLASS_MEMBERS
120#define AMDGPUSubtarget GCNSubtarget
121#include "AMDGPUGenPostLegalizeGICombiner.inc"
122#undef GET_GICOMBINER_CLASS_MEMBERS
123#undef AMDGPUSubtarget
126#define GET_GICOMBINER_IMPL
127#define AMDGPUSubtarget GCNSubtarget
128#include "AMDGPUGenPostLegalizeGICombiner.inc"
129#undef AMDGPUSubtarget
130#undef GET_GICOMBINER_IMPL
132AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
135 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
137 :
Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
138 TII(*STI.getInstrInfo()),
139 Helper(Observer,
B,
false, &KB, MDT, LI),
141#include
"AMDGPUGenPostLegalizeGICombiner.inc"
146bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(
MachineInstr &
MI)
const {
147 if (tryCombineAllImpl(
MI))
150 switch (
MI.getOpcode()) {
151 case TargetOpcode::G_SHL:
152 case TargetOpcode::G_LSHR:
153 case TargetOpcode::G_ASHR:
157 return Helper.tryCombineShiftToUnmerge(
MI, 32);
163bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
170 if (!
MRI.hasOneNonDBGUse(
Cond) ||
175 Info.True =
MI.getOperand(2).getReg();
176 Info.False =
MI.getOperand(3).getReg();
199void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinToFMaxLegacy(
201 B.setInstrAndDebugLoc(
MI);
203 B.buildInstr(Opc, {
MI.getOperand(0)}, {
X,
Y},
MI.getFlags());
210 buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY,
Info.RHS,
Info.LHS);
212 buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY,
Info.LHS,
Info.RHS);
220 buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY,
Info.LHS,
Info.RHS);
222 buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY,
Info.RHS,
Info.LHS);
228 buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY,
Info.RHS,
Info.LHS);
230 buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY,
Info.LHS,
Info.RHS);
236 buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY,
Info.LHS,
Info.RHS);
238 buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY,
Info.RHS,
Info.LHS);
245 MI.eraseFromParent();
248bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
256 LLT Ty =
MRI.getType(DstReg);
259 unsigned SrcSize =
MRI.getType(SrcReg).getSizeInBits();
260 assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
262 return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
268void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
270 B.setInstrAndDebugLoc(
MI);
276 LLT Ty =
MRI.getType(DstReg);
277 LLT SrcTy =
MRI.getType(SrcReg);
279 SrcReg =
B.buildAnyExtOrTrunc(
S32, SrcReg).getReg(0);
282 B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
285 auto Cvt0 =
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {
S32}, {SrcReg},
287 B.buildFPTrunc(DstReg, Cvt0,
MI.getFlags());
290 MI.eraseFromParent();
293bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
300 if (
auto *GI = dyn_cast<GIntrinsic>(&
MI)) {
301 if (GI->is(Intrinsic::amdgcn_rcp))
302 return MRI.getVRegDef(
MI.getOperand(2).getReg());
319 if ((RcpSrcMI = getRcpSrc(
MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
321 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
MI.getOperand(0)})
323 .setMIFlags(
MI.getFlags());
329 if ((SqrtSrcMI = getSqrtSrc(
MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
331 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
MI.getOperand(0)})
333 .setMIFlags(
MI.getFlags());
340bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
343 return MRI.hasOneNonDBGUse(Sqrt);
346void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
350 LLT DstTy =
MRI.getType(Dst);
352 Register RSQ =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
356 B.buildFMul(Dst, RSQ,
Y, Flags);
357 MI.eraseFromParent();
360bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
371 const unsigned Offset =
MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
373 unsigned ShiftOffset = 8 *
Offset;
375 ShiftOffset += ShiftAmt;
377 ShiftOffset -= ShiftAmt;
379 MatchInfo.CvtVal = Src0;
380 MatchInfo.ShiftOffset = ShiftOffset;
381 return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
388void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
390 B.setInstrAndDebugLoc(
MI);
391 unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
395 LLT SrcTy =
MRI.getType(MatchInfo.CvtVal);
398 CvtSrc =
B.buildAnyExt(
S32, CvtSrc).getReg(0);
402 B.buildInstr(NewOpc, {
MI.getOperand(0)}, {CvtSrc},
MI.getFlags());
403 MI.eraseFromParent();
406bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
409 MF.getSubtarget().getTargetLowering());
410 Reg =
MI.getOperand(1).getReg();
420bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
421 MachineInstr &
MI, std::pair<MachineInstr *, unsigned> &MatchData)
const {
423 if (!
MRI.hasOneNonDBGUse(LoadReg))
429 int64_t Width =
MI.getOperand(2).getImm();
431 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
432 MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
434 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
435 MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
437 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
438 MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};
440 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
441 MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};
449void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
450 MachineInstr &
MI, std::pair<MachineInstr *, unsigned> &MatchData)
const {
451 auto [LoadMI, NewOpcode] = MatchData;
455 Register SignExtendInsnDst =
MI.getOperand(0).getReg();
458 MI.eraseFromParent();
461bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
468 if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
469 KB->getKnownBits(Src0).countMinLeadingZeros() >= 32) {
470 NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;
474 if (KB->computeNumSignBits(Src1) >= 33 &&
475 KB->computeNumSignBits(Src0) >= 33) {
476 NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;
482void AMDGPUPostLegalizerCombinerImpl::applyCombine_s_mul_u64(
484 Helper.replaceOpcodeWith(
MI, NewOpcode);
494 AMDGPUPostLegalizerCombiner(
bool IsOptNone =
false);
497 return "AMDGPUPostLegalizerCombiner";
506 AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
510void AMDGPUPostLegalizerCombiner::getAnalysisUsage(
AnalysisUsage &AU)
const {
523AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(
bool IsOptNone)
527 if (!RuleConfig.parseCommandLineOption())
531bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(
MachineFunction &MF) {
533 MachineFunctionProperties::Property::FailedISel))
535 auto *TPC = &getAnalysis<TargetPassConfig>();
544 GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
546 IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
549 LI, EnableOpt,
F.hasOptSize(),
F.hasMinSize());
551 AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB,
nullptr,
552 RuleConfig, ST, MDT, LI);
553 return Impl.combineMachineInstrs();
556char AMDGPUPostLegalizerCombiner::ID = 0;
558 "Combine AMDGPU machine instrs after legalization",
false,
568 return new AMDGPUPostLegalizerCombiner(IsOptNone);
unsigned const MachineRegisterInfo * MRI
This contains common combine transformations that may be used in a combine pass.
This file declares the targeting of the MachineLegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
#define GET_GICOMBINER_CONSTRUCTOR_INITS
Combine AMDGPU machine instrs after legalization
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
This contains common combine transformations that may be used in a combine pass, or by the target else...
Option class for Targets to specify which operations are combined how and when.
This contains the base class for all Combiners generated by TableGen.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
Contains matchers for matching SSA Machine Instructions.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
static StringRef getName(Value *V)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Target-Independent Code Generator Pass Configuration Options pass.
Class for arbitrary precision integers.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
@ FCMP_ULT
1 1 0 0 True if unordered or less than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
virtual bool tryCombineAll(MachineInstr &I) const =0
FunctionPass class - This class is used to implement most global optimizations.
To use KnownBitsInfo analysis in a pass, KnownBitsInfo &Info = getAnalysis<GISelKnownBitsInfoAnalysis...
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
bool hasProperty(Property P) const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineFunctionProperties & getProperties() const
Get the function properties.
Helper class to build MachineInstr.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
const MachineOperand & getOperand(unsigned i) const
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Wrapper class representing virtual and physical registers.
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
StringRef - Represent a constant reference to a string, i.e.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Target-Independent Code Generator Pass Configuration Options.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
operand_type_match m_Reg()
operand_type_match m_Pred()
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
ConstantMatch< APInt > m_ICst(APInt &Cst)
UnaryOp_match< SrcTy, TargetOpcode::G_FSQRT > m_GFSqrt(const SrcTy &Src)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
CompareOp_match< Pred, LHS, RHS, TargetOpcode::G_FCMP > m_GFCmp(const Pred &P, const LHS &L, const RHS &R)
Reg
All possible values of the reg field in the ModR/M byte.
This is an optimization pass for GlobalISel generic memory operations.
FunctionPass * createAMDGPUPostLegalizeCombiner(bool IsOptNone)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &)
void getSelectionDAGFallbackAnalysisUsage(AnalysisUsage &AU)
Modify analysis usage so it preserves passes required for the SelectionDAG fallback.
auto instrs(const MachineBasicBlock &BB)