AMDGPUPostLegalizerCombiner.cpp (LLVM 12.0.0git)
//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPULegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

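// Operands of a matched select(fcmp(pred, lhs, rhs), true, false) that can be
// turned into a legacy min/max instruction.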
struct FMinFMaxLegacyInfo {
  Register LHS;
  Register RHS;
  Register True;
  Register False;
  CmpInst::Predicate Pred;
};

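// Match a G_SELECT whose condition is a single-use G_FCMP comparing the same
// two values the select chooses between. Predicates with no legacy min/max
// equivalent are rejected.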
// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
                                MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

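// Replace the matched select with the equivalent G_AMDGPU_FMIN_LEGACY /
// G_AMDGPU_FMAX_LEGACY instruction.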
static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                              const FMinFMaxLegacyInfo &Info) {

  auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

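  // The legacy min/max instructions return the second source operand when the
  // comparison with a NaN input fails, so each case below orders the operands
  // to preserve the NaN behavior the original select encoded.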
  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

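// Match an integer-to-float conversion whose source can only have its low 8
// bits set, which maps to the hardware's unsigned-byte conversion.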
static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF, CombinerHelper &Helper) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

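// Rewrite the conversion as G_AMDGPU_CVT_F32_UBYTE0, truncating the f32
// result when the original destination was 16-bit.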
static void applyUCharToFloat(MachineInstr &MI) {
  MachineIRBuilder B(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(DstReg);
  LLT SrcTy = B.getMRI()->getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

// FIXME: Should be able to have 2 separate matchdatas rather than custom struct
// boilerplate.
struct CvtF32UByteMatchInfo {
  Register CvtVal;
  unsigned ShiftOffset;
};

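// Match a G_AMDGPU_CVT_F32_UBYTEn whose source is shifted by a constant
// multiple of 8 (possibly behind a zext); the shift folds into the byte
// index n.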
static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF,
                              CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

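// Re-emit the conversion with the byte index implied by the folded shift.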
static void applyCvtF32UByteN(MachineInstr &MI,
                              const CvtF32UByteMatchInfo &MatchInfo) {
  MachineIRBuilder B(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

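// The TableGen-generated combiner helper is included three times: its
// dependencies here, its declaration inside the anonymous namespace below,
// and its definition after the class that uses it.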
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

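// Run the TableGen-generated rules first; if none fire, try the manually
// implemented combines below.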
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg);

  if (Generated.tryCombineAll(Observer, MI, B, Helper))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
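// Entry point: set up the combiner with the analyses it needs (known bits
// always; the dominator tree only when optimizing) and run it over the
// function.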

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm