28#include "llvm/IR/IntrinsicsAMDGPU.h"
31#define GET_GICOMBINER_DEPS
32#include "AMDGPUGenPreLegalizeGICombiner.inc"
33#undef GET_GICOMBINER_DEPS
35#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
38using namespace MIPatternMatch;
41#define GET_GICOMBINER_TYPES
42#include "AMDGPUGenPostLegalizeGICombiner.inc"
43#undef GET_GICOMBINER_TYPES
45class AMDGPUPostLegalizerCombinerImpl :
public Combiner {
47 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
54 AMDGPUPostLegalizerCombinerImpl(
57 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
61 static const char *
getName() {
return "AMDGPUPostLegalizerCombinerImpl"; }
66 struct FMinFMaxLegacyInfo {
74 FMinFMaxLegacyInfo &
Info)
const;
76 const FMinFMaxLegacyInfo &
Info)
const;
90 struct CvtF32UByteMatchInfo {
96 CvtF32UByteMatchInfo &MatchInfo)
const;
98 const CvtF32UByteMatchInfo &MatchInfo)
const;
104 bool matchCombineSignExtendInReg(
105 MachineInstr &
MI, std::pair<MachineInstr *, unsigned> &MatchInfo)
const;
106 void applyCombineSignExtendInReg(
107 MachineInstr &
MI, std::pair<MachineInstr *, unsigned> &MatchInfo)
const;
114 bool matchCombine_s_mul_u64(
MachineInstr &
MI,
unsigned &NewOpcode)
const;
117#define GET_GICOMBINER_CLASS_MEMBERS
118#define AMDGPUSubtarget GCNSubtarget
119#include "AMDGPUGenPostLegalizeGICombiner.inc"
120#undef GET_GICOMBINER_CLASS_MEMBERS
121#undef AMDGPUSubtarget
124#define GET_GICOMBINER_IMPL
125#define AMDGPUSubtarget GCNSubtarget
126#include "AMDGPUGenPostLegalizeGICombiner.inc"
127#undef AMDGPUSubtarget
128#undef GET_GICOMBINER_IMPL
130AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
133 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
135 :
Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
136 TII(*STI.getInstrInfo()),
137 Helper(Observer,
B,
false, &KB, MDT, LI),
139#include
"AMDGPUGenPostLegalizeGICombiner.inc"
144bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(
MachineInstr &
MI)
const {
145 if (tryCombineAllImpl(
MI))
148 switch (
MI.getOpcode()) {
149 case TargetOpcode::G_SHL:
150 case TargetOpcode::G_LSHR:
151 case TargetOpcode::G_ASHR:
155 return Helper.tryCombineShiftToUnmerge(
MI, 32);
161bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
175 if ((
Info.LHS != True ||
Info.RHS != False) &&
176 (
Info.LHS != False ||
Info.RHS != True))
182 if (
Info.LHS != True)
189void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinFMaxLegacy(
192 : AMDGPU::G_AMDGPU_FMIN_LEGACY;
202 B.buildInstr(Opc, {
MI.getOperand(0)}, {
X,
Y},
MI.getFlags());
204 MI.eraseFromParent();
207bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
215 LLT Ty =
MRI.getType(DstReg);
218 unsigned SrcSize =
MRI.getType(SrcReg).getSizeInBits();
219 assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
221 return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
227void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
233 LLT Ty =
MRI.getType(DstReg);
234 LLT SrcTy =
MRI.getType(SrcReg);
236 SrcReg =
B.buildAnyExtOrTrunc(
S32, SrcReg).getReg(0);
239 B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
242 auto Cvt0 =
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {
S32}, {SrcReg},
244 B.buildFPTrunc(DstReg, Cvt0,
MI.getFlags());
247 MI.eraseFromParent();
250bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
257 if (
auto *GI = dyn_cast<GIntrinsic>(&
MI)) {
258 if (GI->is(Intrinsic::amdgcn_rcp))
259 return MRI.getVRegDef(
MI.getOperand(2).getReg());
276 if ((RcpSrcMI = getRcpSrc(
MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
278 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
MI.getOperand(0)})
280 .setMIFlags(
MI.getFlags());
286 if ((SqrtSrcMI = getSqrtSrc(
MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
288 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
MI.getOperand(0)})
290 .setMIFlags(
MI.getFlags());
297bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
300 return MRI.hasOneNonDBGUse(Sqrt);
303void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
307 LLT DstTy =
MRI.getType(Dst);
309 Register RSQ =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
313 B.buildFMul(Dst, RSQ,
Y, Flags);
314 MI.eraseFromParent();
317bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
328 const unsigned Offset =
MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
330 unsigned ShiftOffset = 8 *
Offset;
332 ShiftOffset += ShiftAmt;
334 ShiftOffset -= ShiftAmt;
336 MatchInfo.CvtVal = Src0;
337 MatchInfo.ShiftOffset = ShiftOffset;
338 return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
345void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
347 unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
351 LLT SrcTy =
MRI.getType(MatchInfo.CvtVal);
354 CvtSrc =
B.buildAnyExt(
S32, CvtSrc).getReg(0);
358 B.buildInstr(NewOpc, {
MI.getOperand(0)}, {CvtSrc},
MI.getFlags());
359 MI.eraseFromParent();
362bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
365 MF.getSubtarget().getTargetLowering());
366 Reg =
MI.getOperand(1).getReg();
376bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
377 MachineInstr &
MI, std::pair<MachineInstr *, unsigned> &MatchData)
const {
379 if (!
MRI.hasOneNonDBGUse(LoadReg))
385 int64_t Width =
MI.getOperand(2).getImm();
387 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
388 MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
390 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
391 MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
393 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
394 MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};
396 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
397 MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};
405void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
406 MachineInstr &
MI, std::pair<MachineInstr *, unsigned> &MatchData)
const {
407 auto [LoadMI, NewOpcode] = MatchData;
411 Register SignExtendInsnDst =
MI.getOperand(0).getReg();
414 MI.eraseFromParent();
417bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
424 if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
425 KB->getKnownBits(Src0).countMinLeadingZeros() >= 32) {
426 NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;
430 if (KB->computeNumSignBits(Src1) >= 33 &&
431 KB->computeNumSignBits(Src0) >= 33) {
432 NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;
445 AMDGPUPostLegalizerCombiner(
bool IsOptNone =
false);
448 return "AMDGPUPostLegalizerCombiner";
457 AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
461void AMDGPUPostLegalizerCombiner::getAnalysisUsage(
AnalysisUsage &AU)
const {
474AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(
bool IsOptNone)
478 if (!RuleConfig.parseCommandLineOption())
482bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(
MachineFunction &MF) {
484 MachineFunctionProperties::Property::FailedISel))
486 auto *TPC = &getAnalysis<TargetPassConfig>();
495 GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
498 : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
501 LI, EnableOpt,
F.hasOptSize(),
F.hasMinSize());
503 AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB,
nullptr,
504 RuleConfig, ST, MDT, LI);
505 return Impl.combineMachineInstrs();
// Pass identification: static ID whose address uniquely identifies this
// pass to the legacy PassManager (standard LLVM pass-ID idiom).
508char AMDGPUPostLegalizerCombiner::ID = 0;
510 "Combine AMDGPU machine instrs after legalization",
false,
520 return new AMDGPUPostLegalizerCombiner(IsOptNone);
unsigned const MachineRegisterInfo * MRI
This contains common combine transformations that may be used in a combine pass.
This file declares the targeting of the MachineLegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
#define GET_GICOMBINER_CONSTRUCTOR_INITS
Combine AMDGPU machine instrs after legalization
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
This contains common combine transformations that may be used in a combine pass, or by the target elsewhere.
Option class for Targets to specify which operations are combined how and when.
This contains the base class for all Combiners generated by TableGen.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic operations.
const HexagonInstrInfo * TII
Contains matchers for matching SSA Machine Instructions.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
static StringRef getName(Value *V)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Target-Independent Code Generator Pass Configuration Options pass.
Class for arbitrary precision integers.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getUnorderedPredicate() const
virtual bool tryCombineAll(MachineInstr &I) const =0
FunctionPass class - This class is used to implement most global optimizations.
To use KnownBitsInfo analysis in a pass, KnownBitsInfo &Info = getAnalysis<GISelKnownBitsInfoAnalysis>().get(MF);
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Analysis pass which computes a MachineDominatorTree.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of passes that operate on the MachineFunction representation.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformation or analysis.
bool hasProperty(Property P) const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineFunctionProperties & getProperties() const
Get the function properties.
Helper class to build MachineInstr.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
const MachineOperand & getOperand(unsigned i) const
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
unsigned getPredicate() const
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at application startup.
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Wrapper class representing virtual and physical registers.
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
StringRef - Represent a constant reference to a string, i.e.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Target-Independent Code Generator Pass Configuration Options.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
operand_type_match m_Reg()
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
ConstantMatch< APInt > m_ICst(APInt &Cst)
UnaryOp_match< SrcTy, TargetOpcode::G_FSQRT > m_GFSqrt(const SrcTy &Src)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
Reg
All possible values of the reg field in the ModR/M byte.
This is an optimization pass for GlobalISel generic memory operations.
FunctionPass * createAMDGPUPostLegalizeCombiner(bool IsOptNone)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &)
void getSelectionDAGFallbackAnalysisUsage(AnalysisUsage &AU)
Modify analysis usage so it preserves passes required for the SelectionDAG fallback.
auto instrs(const MachineBasicBlock &BB)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.