//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Interface definition of the TargetLowering class that is common
/// to all AMD GPUs.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H

#include "AMDGPU.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/TargetLowering.h"

namespace llvm {

class AMDGPUMachineFunction;
class AMDGPUSubtarget;
struct ArgDescriptor;

class AMDGPUTargetLowering : public TargetLowering {
private:
  const AMDGPUSubtarget *Subtarget;

  /// \returns The AMDGPUISD::FFBH_U32 or AMDGPUISD::FFBL_B32 node (selected
  /// by \p Opc) if the incoming \p Op may have been legalized from a smaller
  /// type VT. We need to match the pre-legalized type because the generic
  /// legalization inserts the add/sub between the select and compare.
  SDValue getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL,
                      unsigned Opc) const;

public:
  static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG);
  static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);

protected:
  SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG,
                    double Log2BaseInverted) const;
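  // LowerFLOG uses the identity log_b(x) = log2(x) * (1 / log2(b));
  // Log2BaseInverted supplies that constant, e.g. ln(2) for flog (since
  // 1/log2(e) == ln(2)) or log10(2) for flog10.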
  SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
  SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
  SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
  SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;

protected:
  bool shouldCombineMemoryType(EVT VT) const;
  SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;

  SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
                                       unsigned Opc, SDValue LHS,
                                       uint32_t ValLo, uint32_t ValHi) const;
  SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
                                  SDValue RHS, DAGCombinerInfo &DCI) const;
  SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;

  bool isConstantCostlierToNegate(SDValue N) const;
  SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;

  static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);

  virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                                     SelectionDAG &DAG) const;

  /// Return 64-bit value Op as two 32-bit integers.
  std::pair<SDValue, SDValue> split64BitValue(SDValue Op,
                                              SelectionDAG &DAG) const;
  SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
  SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
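  // split64BitValue and the half accessors work by bitcasting the i64 to
  // v2i32 and extracting the elements; they are how 64-bit operations get
  // expanded into 32-bit pieces here.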

  /// Split a vector load into 2 loads of half the vector.
  SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;

  /// Split a vector store into 2 stores of half the vector.
  /// \returns The resulting chain.
  SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
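  // LowerDIVREM24 handles division where both operands fit in 24 bits: the
  // values are converted to f32 and divided with the hardware reciprocal,
  // then the integer quotient and remainder are fixed up (a sketch of the
  // approach rather than the exact instruction sequence).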
  SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
  void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
                      SmallVectorImpl<SDValue> &Results) const;

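  /// Custom argument analysis for compute entry functions: kernel arguments
  /// are passed via the kernarg memory segment rather than in registers, so
  /// each incoming value is assigned a byte offset instead of a register
  /// (see the implementation for the exact ABI rules).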
  void analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const;

public:
  AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);

  bool mayIgnoreSignedZero(SDValue Op) const {
    if (getTargetMachine().Options.NoSignedZerosFPMath)
      return true;

    const auto Flags = Op.getNode()->getFlags();
    if (Flags.isDefined())
      return Flags.hasNoSignedZeros();

    return false;
  }
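  // Example: folding a select into FMIN_LEGACY/FMAX_LEGACY can change which
  // zero (+0.0 vs. -0.0) the expression produces, so such combines check
  // mayIgnoreSignedZero() before firing.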

  static inline SDValue stripBitcast(SDValue Val) {
    return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
  }
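  // Illustrative use, matching an f32 value that legalization hid behind an
  // i32 bitcast (the surrounding names are hypothetical):
  //   SDValue Src = stripBitcast(N->getOperand(0));
  //   if (Src.getValueType() == MVT::f32) {
  //     // fold using Src
  //   }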

  static bool allUsesHaveSourceMods(const SDNode *N,
                                    unsigned CostThreshold = 4);
  bool isFAbsFree(EVT VT) const override;
  bool isFNegFree(EVT VT) const override;
  bool isTruncateFree(EVT Src, EVT Dest) const override;
  bool isTruncateFree(Type *Src, Type *Dest) const override;

  bool isZExtFree(Type *Src, Type *Dest) const override;
  bool isZExtFree(EVT Src, EVT Dest) const override;
  bool isZExtFree(SDValue Val, EVT VT2) const override;

  bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;

  MVT getVectorIdxTy(const DataLayout &) const override;
  bool isSelectSupported(SelectSupportKind) const override;

  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
  bool ShouldShrinkFPConstant(EVT VT) const override;
  bool shouldReduceLoadWidth(SDNode *Load,
                             ISD::LoadExtType ExtType,
                             EVT ExtVT) const override;

  bool isLoadBitCastBeneficial(EVT, EVT) const final;

  bool storeOfVectorConstantIsCheap(EVT MemVT,
                                    unsigned NumElem,
                                    unsigned AS) const override;
  bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
  bool isCheapToSpeculateCttz() const override;
  bool isCheapToSpeculateCtlz() const override;

  bool isSDNodeAlwaysUniform(const SDNode *N) const override;
  static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
  static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);

  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
                      SelectionDAG &DAG) const override;

  SDValue addTokenForArgument(SDValue Chain,
                              SelectionDAG &DAG,
                              MachineFrameInfo &MFI,
                              int ClobberedFI) const;

  SDValue lowerUnhandledCall(CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals,
                             StringRef Reason) const;
  SDValue LowerCall(CallLoweringInfo &CLI,
                    SmallVectorImpl<SDValue> &InVals) const override;

  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op,
                                  SelectionDAG &DAG) const;

  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
  void ReplaceNodeResults(SDNode *N,
                          SmallVectorImpl<SDValue> &Results,
                          SelectionDAG &DAG) const override;

  SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
                               SDValue RHS, SDValue True, SDValue False,
                               SDValue CC, DAGCombinerInfo &DCI) const;

  const char *getTargetNodeName(unsigned Opcode) const override;

  // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection for
  // AMDGPU.
  // A commit (git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319036
  // 91177308-0d34-0410-b5e6-96231b3b80d8) turned on MergeConsecutiveStores()
  // before Instruction Selection for all targets. Enough AMDGPU compiles go
  // into an infinite loop (MergeConsecutiveStores() merges two stores;
  // LegalizeStoreOps() un-merges; MergeConsecutiveStores() re-merges, etc.)
  // to warrant turning it off for now.
  bool mergeStoresAfterLegalization() const override { return false; }

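  // sqrt maps to a single native instruction (e.g. v_sqrt_f32), so an
  // estimate-plus-refinement expansion is never preferable; fsqrt is treated
  // as unconditionally cheap.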
  bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
    return true;
  }
  SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                          int &RefinementSteps, bool &UseOneConstNR,
                          bool Reciprocal) const override;
  SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                           int &RefinementSteps) const override;

  virtual SDNode *PostISelFolding(MachineSDNode *N,
                                  SelectionDAG &DAG) const = 0;

  /// Determine which bits of \p Op, restricted to the demanded elements
  /// \p DemandedElts, are known to be either zero or one and return them in
  /// \p Known.
  void computeKnownBitsForTargetNode(const SDValue Op,
                                     KnownBits &Known,
                                     const APInt &DemandedElts,
                                     const SelectionDAG &DAG,
                                     unsigned Depth = 0) const override;

  unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts,
                                           const SelectionDAG &DAG,
                                           unsigned Depth = 0) const override;

  bool isKnownNeverNaNForTargetNode(SDValue Op,
                                    const SelectionDAG &DAG,
                                    bool SNaN = false,
                                    unsigned Depth = 0) const override;

  /// Helper function that adds \p Reg to the LiveIn list of the DAG's
  /// MachineFunction.
  ///
  /// \returns a RegisterSDNode representing \p Reg if \p RawReg is true,
  /// otherwise a copy from the register.
  SDValue CreateLiveInRegister(SelectionDAG &DAG,
                               const TargetRegisterClass *RC,
                               unsigned Reg, EVT VT,
                               const SDLoc &SL,
                               bool RawReg = false) const;
  SDValue CreateLiveInRegister(SelectionDAG &DAG,
                               const TargetRegisterClass *RC,
                               unsigned Reg, EVT VT) const {
    return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()));
  }

  // Returns the raw live in register rather than a copy from it.
  SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG,
                                  const TargetRegisterClass *RC,
                                  unsigned Reg, EVT VT) const {
    return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()),
                                true);
  }
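  // Typical use is reading a value preloaded into a register at function
  // entry (illustrative only; the actual register and class come from the
  // subtarget's argument info):
  //   SDValue Ptr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
  //                                      KernargSegmentPtrReg, MVT::i64);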

  /// Similar to CreateLiveInRegister, except the value may be loaded from a
  /// stack slot rather than passed in a register.
  SDValue loadStackInputValue(SelectionDAG &DAG,
                              EVT VT,
                              const SDLoc &SL,
                              int64_t Offset) const;

  SDValue storeStackInputValue(SelectionDAG &DAG,
                               const SDLoc &SL,
                               SDValue Chain,
                               SDValue ArgVal,
                               int64_t Offset) const;

  SDValue loadInputValue(SelectionDAG &DAG,
                         const TargetRegisterClass *RC,
                         EVT VT, const SDLoc &SL,
                         const ArgDescriptor &Arg) const;

  enum ImplicitParameter {
    FIRST_IMPLICIT,
    GRID_DIM = FIRST_IMPLICIT,
    GRID_OFFSET,
  };

  /// Helper function that returns the byte offset of the given
  /// type of implicit parameter.
  uint32_t getImplicitParameterOffset(const MachineFunction &MF,
                                      const ImplicitParameter Param) const;
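  // The implicit parameters live just past the explicit kernel arguments, so
  // the returned offset is (roughly) the aligned size of the explicit kernarg
  // block plus the parameter's slot within the implicit block.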

  MVT getFenceOperandTy(const DataLayout &DL) const override {
    return MVT::i32;
  }

  AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
};

namespace AMDGPUISD {

enum NodeType : unsigned {
  // AMDIL ISD Opcodes
  FIRST_NUMBER = ISD::BUILTIN_OP_END,
  UMUL, // 32-bit unsigned multiplication
  BRANCH_COND,
  // End AMDIL ISD Opcodes

  // Function call.
  CALL,
  TC_RETURN,
  TRAP,

  // Masked control flow nodes.
  IF,
  ELSE,
  LOOP,

  // A uniform kernel return that terminates the wavefront.
  ENDPGM,

  // Return to a shader part's epilog code.
  RETURN_TO_EPILOG,

  // Return with values from a non-entry function.
  RET_FLAG,

  DWORDADDR,
  FRACT,

  /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output
  /// modifier behavior with dx10_enable.
  CLAMP,

  // This is SETCC with the full mask result which is used for a compare with a
  // result bit per item in the wavefront.
  SETCC,
  SETREG,
  // FP ops with input and output chain.
  FMA_W_CHAIN,
  FMUL_W_CHAIN,

  // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
  // Denormals handled on some parts.
  COS_HW,
  SIN_HW,
  FMAX_LEGACY,
  FMIN_LEGACY,
  FMAX3,
  SMAX3,
  UMAX3,
  FMIN3,
  SMIN3,
  UMIN3,
  FMED3,
  SMED3,
  UMED3,
  FDOT2,
  URECIP,
  DIV_SCALE,
  DIV_FMAS,
  DIV_FIXUP,
  // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
  // treated as an illegal operation.
  FMAD_FTZ,
  TRIG_PREOP, // 1 ULP max error for f64

  // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
  // For f64, max error 2^29 ULP, handles denormals.
  RCP,
  RSQ,
  RCP_LEGACY,
  RSQ_LEGACY,
  RCP_IFLAG,
  FMUL_LEGACY,
  RSQ_CLAMP,
  LDEXP,
  FP_CLASS,
  DOT4,
  CARRY,
  BORROW,
  BFE_U32, // Extract range of bits with zero extension to 32 bits.
  BFE_I32, // Extract range of bits with sign extension to 32 bits.
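  // e.g. BFE_U32(src, offset, width) yields
  //   (src >> offset) & ((1 << width) - 1), zero-filled above bit 'width'.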
  BFI, // (src0 & src1) | (~src0 & src2)
  BFM, // Insert a range of bits into a 32-bit word.
  FFBH_U32, // ctlz with -1 if input is zero.
  FFBH_I32,
  FFBL_B32, // cttz with -1 if input is zero.
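  // e.g. FFBH_U32(0x00800000) == 8, while FFBH_U32(0) == -1 rather than
  // being undefined as it would be for ISD::CTLZ_ZERO_UNDEF.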
  MUL_U24,
  MUL_I24,
  MULHI_U24,
  MULHI_I24,
  MAD_U24,
  MAD_I24,
  MAD_U64_U32,
  MAD_I64_I32,
  MUL_LOHI_I24,
  MUL_LOHI_U24,
  PERM,
  TEXTURE_FETCH,
  EXPORT, // exp on SI+
  EXPORT_DONE, // exp on SI+ with done bit set
  R600_EXPORT,
  CONST_ADDRESS,
  REGISTER_LOAD,
  REGISTER_STORE,
  SAMPLE,
  SAMPLEB,
  SAMPLED,
  SAMPLEL,

  // These cvt_f32_ubyte* nodes need to remain consecutive and in order.
  CVT_F32_UBYTE0,
  CVT_F32_UBYTE1,
  CVT_F32_UBYTE2,
  CVT_F32_UBYTE3,

  // Convert two f32 values into a single register holding two packed f16
  // with round to zero.
  CVT_PKRTZ_F16_F32,
  CVT_PKNORM_I16_F32,
  CVT_PKNORM_U16_F32,
  CVT_PK_I16_I32,
  CVT_PK_U16_U32,

  // Same as the standard node, except the high bits of the resulting integer
  // are known 0.
  FP_TO_FP16,

  // Wrapper around fp16 results that are known to zero the high bits.
  FP16_ZEXT,

  /// This node is for VLIW targets and it is used to represent a vector
  /// that is stored in consecutive registers with the same channel.
  /// For example:
  ///   |X  |Y|Z|W|
  /// T0|v.x| | | |
  /// T1|v.y| | | |
  /// T2|v.z| | | |
  /// T3|v.w| | | |
  BUILD_VERTICAL_VECTOR,
  /// Pointer to the start of the shader's constant data.
  CONST_DATA_PTR,
  INIT_EXEC,
  INIT_EXEC_FROM_INPUT,
  SENDMSG,
  SENDMSGHALT,
  INTERP_MOV,
  INTERP_P1,
  INTERP_P2,
  PC_ADD_REL_OFFSET,
  KILL,
  DUMMY_CHAIN,
  FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
  STORE_MSKOR,
  LOAD_CONSTANT,
  TBUFFER_STORE_FORMAT,
  TBUFFER_STORE_FORMAT_X3,
  TBUFFER_STORE_FORMAT_D16,
  TBUFFER_LOAD_FORMAT,
  TBUFFER_LOAD_FORMAT_D16,
  ATOMIC_CMP_SWAP,
  ATOMIC_INC,
  ATOMIC_DEC,
  ATOMIC_LOAD_FADD,
  ATOMIC_LOAD_FMIN,
  ATOMIC_LOAD_FMAX,
  BUFFER_LOAD,
  BUFFER_LOAD_FORMAT,
  BUFFER_LOAD_FORMAT_D16,
  SBUFFER_LOAD,
  BUFFER_STORE,
  BUFFER_STORE_FORMAT,
  BUFFER_STORE_FORMAT_D16,
  BUFFER_ATOMIC_SWAP,
  BUFFER_ATOMIC_ADD,
  BUFFER_ATOMIC_SUB,
  BUFFER_ATOMIC_SMIN,
  BUFFER_ATOMIC_UMIN,
  BUFFER_ATOMIC_SMAX,
  BUFFER_ATOMIC_UMAX,
  BUFFER_ATOMIC_AND,
  BUFFER_ATOMIC_OR,
  BUFFER_ATOMIC_XOR,
  BUFFER_ATOMIC_CMPSWAP,

  LAST_AMDGPU_ISD_NUMBER
};

} // End namespace AMDGPUISD

} // End namespace llvm

#endif