Line data Source code
1 : //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : /// \file
11 : /// Custom DAG lowering for SI
12 : //
13 : //===----------------------------------------------------------------------===//
14 :
15 : #ifdef _MSC_VER
16 : // Provide M_PI.
17 : #define _USE_MATH_DEFINES
18 : #endif
19 :
20 : #include "SIISelLowering.h"
21 : #include "AMDGPU.h"
22 : #include "AMDGPUIntrinsicInfo.h"
23 : #include "AMDGPUSubtarget.h"
24 : #include "AMDGPUTargetMachine.h"
25 : #include "SIDefines.h"
26 : #include "SIInstrInfo.h"
27 : #include "SIMachineFunctionInfo.h"
28 : #include "SIRegisterInfo.h"
29 : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
30 : #include "Utils/AMDGPUBaseInfo.h"
31 : #include "llvm/ADT/APFloat.h"
32 : #include "llvm/ADT/APInt.h"
33 : #include "llvm/ADT/ArrayRef.h"
34 : #include "llvm/ADT/BitVector.h"
35 : #include "llvm/ADT/SmallVector.h"
36 : #include "llvm/ADT/Statistic.h"
37 : #include "llvm/ADT/StringRef.h"
38 : #include "llvm/ADT/StringSwitch.h"
39 : #include "llvm/ADT/Twine.h"
40 : #include "llvm/CodeGen/Analysis.h"
41 : #include "llvm/CodeGen/CallingConvLower.h"
42 : #include "llvm/CodeGen/DAGCombine.h"
43 : #include "llvm/CodeGen/ISDOpcodes.h"
44 : #include "llvm/CodeGen/MachineBasicBlock.h"
45 : #include "llvm/CodeGen/MachineFrameInfo.h"
46 : #include "llvm/CodeGen/MachineFunction.h"
47 : #include "llvm/CodeGen/MachineInstr.h"
48 : #include "llvm/CodeGen/MachineInstrBuilder.h"
49 : #include "llvm/CodeGen/MachineMemOperand.h"
50 : #include "llvm/CodeGen/MachineModuleInfo.h"
51 : #include "llvm/CodeGen/MachineOperand.h"
52 : #include "llvm/CodeGen/MachineRegisterInfo.h"
53 : #include "llvm/CodeGen/SelectionDAG.h"
54 : #include "llvm/CodeGen/SelectionDAGNodes.h"
55 : #include "llvm/CodeGen/TargetCallingConv.h"
56 : #include "llvm/CodeGen/TargetRegisterInfo.h"
57 : #include "llvm/CodeGen/ValueTypes.h"
58 : #include "llvm/IR/Constants.h"
59 : #include "llvm/IR/DataLayout.h"
60 : #include "llvm/IR/DebugLoc.h"
61 : #include "llvm/IR/DerivedTypes.h"
62 : #include "llvm/IR/DiagnosticInfo.h"
63 : #include "llvm/IR/Function.h"
64 : #include "llvm/IR/GlobalValue.h"
65 : #include "llvm/IR/InstrTypes.h"
66 : #include "llvm/IR/Instruction.h"
67 : #include "llvm/IR/Instructions.h"
68 : #include "llvm/IR/IntrinsicInst.h"
69 : #include "llvm/IR/Type.h"
70 : #include "llvm/Support/Casting.h"
71 : #include "llvm/Support/CodeGen.h"
72 : #include "llvm/Support/CommandLine.h"
73 : #include "llvm/Support/Compiler.h"
74 : #include "llvm/Support/ErrorHandling.h"
75 : #include "llvm/Support/KnownBits.h"
76 : #include "llvm/Support/MachineValueType.h"
77 : #include "llvm/Support/MathExtras.h"
78 : #include "llvm/Target/TargetOptions.h"
79 : #include <cassert>
80 : #include <cmath>
81 : #include <cstdint>
82 : #include <iterator>
83 : #include <tuple>
84 : #include <utility>
85 : #include <vector>
86 :
87 : using namespace llvm;
88 :
89 : #define DEBUG_TYPE "si-lower"
90 :
91 : STATISTIC(NumTailCalls, "Number of tail calls");
92 :
93 : static cl::opt<bool> EnableVGPRIndexMode(
94 : "amdgpu-vgpr-index-mode",
95 : cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
96 : cl::init(false));
97 :
98 : static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
99 : "amdgpu-frame-index-zero-bits",
100 : cl::desc("High bits of frame index assumed to be zero"),
101 : cl::init(5),
102 : cl::ReallyHidden);
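// A hypothetical invocation overriding both options above (the flag names come
// from the cl::opt registrations here; the rest of the llc command line is an
// assumption for illustration only):
//
//   llc -mtriple=amdgcn-- -amdgpu-vgpr-index-mode -amdgpu-frame-index-zero-bits=3 kernel.ll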
103 :
104 : static unsigned findFirstFreeSGPR(CCState &CCInfo) {
105 45 : unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
106 209 : for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
107 418 : if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
108 : return AMDGPU::SGPR0 + Reg;
109 : }
110 : }
111 0 : llvm_unreachable("Cannot allocate sgpr");
112 : }
113 :
114 2492 : SITargetLowering::SITargetLowering(const TargetMachine &TM,
115 2492 : const GCNSubtarget &STI)
116 : : AMDGPUTargetLowering(TM, STI),
117 2492 : Subtarget(&STI) {
118 : addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
119 : addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
120 :
121 : addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
122 : addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
123 :
124 : addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
125 : addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
126 : addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
127 :
128 : addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
129 : addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
130 :
131 : addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
132 : addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
133 :
134 : addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
135 : addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
136 :
137 : addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
138 : addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
139 :
140 2492 : if (Subtarget->has16BitInsts()) {
141 : addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
142 : addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
143 :
144 : // Unless there are also VOP3P operations, not even the vector operations are really legal.
145 : addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
146 : addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
147 : addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
148 : addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
149 : }
150 :
151 2492 : computeRegisterProperties(Subtarget->getRegisterInfo());
152 :
153 : // We need to custom lower vector loads and stores from local memory.
154 : setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
155 : setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
156 : setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
157 : setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
158 : setOperationAction(ISD::LOAD, MVT::i1, Custom);
159 : setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
160 :
161 : setOperationAction(ISD::STORE, MVT::v2i32, Custom);
162 : setOperationAction(ISD::STORE, MVT::v4i32, Custom);
163 : setOperationAction(ISD::STORE, MVT::v8i32, Custom);
164 : setOperationAction(ISD::STORE, MVT::v16i32, Custom);
165 : setOperationAction(ISD::STORE, MVT::i1, Custom);
166 : setOperationAction(ISD::STORE, MVT::v32i32, Custom);
167 :
168 : setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
169 : setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
170 : setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
171 : setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
172 : setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
173 : setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
174 : setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
175 : setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
176 : setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
177 : setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
178 :
179 : setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
180 : setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
181 :
182 : setOperationAction(ISD::SELECT, MVT::i1, Promote);
183 : setOperationAction(ISD::SELECT, MVT::i64, Custom);
184 : setOperationAction(ISD::SELECT, MVT::f64, Promote);
185 : AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
186 :
187 : setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
188 : setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
189 : setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
190 : setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
191 : setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
192 :
193 : setOperationAction(ISD::SETCC, MVT::i1, Promote);
194 : setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
195 : setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
196 : AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
197 :
198 : setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
199 : setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
200 :
201 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
202 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
203 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
204 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
205 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
206 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
207 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
208 :
209 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
210 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
211 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
212 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
213 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
214 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
215 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
216 :
217 : setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
218 : setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
219 : setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
220 :
221 : setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
222 : setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
223 : setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
224 : setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
225 :
226 : setOperationAction(ISD::BRCOND, MVT::Other, Custom);
227 : setOperationAction(ISD::BR_CC, MVT::i1, Expand);
228 : setOperationAction(ISD::BR_CC, MVT::i32, Expand);
229 : setOperationAction(ISD::BR_CC, MVT::i64, Expand);
230 : setOperationAction(ISD::BR_CC, MVT::f32, Expand);
231 : setOperationAction(ISD::BR_CC, MVT::f64, Expand);
232 :
233 : setOperationAction(ISD::UADDO, MVT::i32, Legal);
234 : setOperationAction(ISD::USUBO, MVT::i32, Legal);
235 :
236 : setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
237 : setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
238 :
239 : setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
240 : setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
241 : setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
242 :
243 : #if 0
244 : setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
245 : setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
246 : #endif
247 :
248 : // We only support LOAD/STORE and vector manipulation ops for vectors
249 : // with > 4 elements.
250 22428 : for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
251 24920 : MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
252 5853708 : for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
253 5831280 : switch (Op) {
254 : case ISD::LOAD:
255 : case ISD::STORE:
256 : case ISD::BUILD_VECTOR:
257 : case ISD::BITCAST:
258 : case ISD::EXTRACT_VECTOR_ELT:
259 : case ISD::INSERT_VECTOR_ELT:
260 : case ISD::INSERT_SUBVECTOR:
261 : case ISD::EXTRACT_SUBVECTOR:
262 : case ISD::SCALAR_TO_VECTOR:
263 : break;
264 22428 : case ISD::CONCAT_VECTORS:
265 : setOperationAction(Op, VT, Custom);
266 22428 : break;
267 5607000 : default:
268 : setOperationAction(Op, VT, Expand);
269 5607000 : break;
270 : }
271 : }
272 : }
273 :
274 : setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
275 :
276 : // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
277 : // is expanded to avoid having two separate loops in case the index is a VGPR.
278 :
279 : // Most operations are naturally 32-bit vector operations. We only support
280 : // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
281 7476 : for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
282 : setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
283 : AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
284 :
285 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
286 : AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
287 :
288 : setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
289 : AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
290 :
291 : setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
292 : AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
293 : }
294 :
295 : setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
296 : setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
297 : setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
298 : setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
299 :
300 : setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
301 : setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
302 :
303 : // Avoid stack access for these.
304 : // TODO: Generalize to more vector types.
305 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
306 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
307 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
308 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
309 :
310 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
311 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
312 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
313 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
314 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
315 :
316 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
317 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
318 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
319 :
320 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
321 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
322 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
323 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
324 :
325 : // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
326 : // and output demarshalling
327 : setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
328 : setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
329 :
330 : // We can't return success/failure, only the old value,
331 : // so let LLVM add the comparison.
332 : setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
333 : setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
334 :
335 2492 : if (Subtarget->hasFlatAddressSpace()) {
336 : setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
337 : setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
338 : }
339 :
340 : setOperationAction(ISD::BSWAP, MVT::i32, Legal);
341 : setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
342 :
343 : // This is s_memtime on SI and s_memrealtime on VI.
344 : setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
345 : setOperationAction(ISD::TRAP, MVT::Other, Custom);
346 : setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
347 :
348 2492 : if (Subtarget->has16BitInsts()) {
349 : setOperationAction(ISD::FLOG, MVT::f16, Custom);
350 : setOperationAction(ISD::FEXP, MVT::f16, Custom);
351 : setOperationAction(ISD::FLOG10, MVT::f16, Custom);
352 : }
353 :
354 : // v_mad_f32 does not support denormals according to some sources.
355 2492 : if (!Subtarget->hasFP32Denormals())
356 : setOperationAction(ISD::FMAD, MVT::f32, Legal);
357 :
358 : if (!Subtarget->hasBFI()) {
359 : // fcopysign can be done in a single instruction with BFI.
360 : setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
361 : setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
362 : }
363 :
364 : if (!Subtarget->hasBCNT(32))
365 : setOperationAction(ISD::CTPOP, MVT::i32, Expand);
366 :
367 : if (!Subtarget->hasBCNT(64))
368 : setOperationAction(ISD::CTPOP, MVT::i64, Expand);
369 :
370 : if (Subtarget->hasFFBH())
371 : setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
372 :
373 : if (Subtarget->hasFFBL())
374 : setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
375 :
376 : // We only really have 32-bit BFE instructions (and 16-bit on VI).
377 : //
378 : // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
379 : // effort to match them now. We want this to be false for i64 cases when the
380 : // extraction isn't restricted to the upper or lower half. Ideally we would
381 : // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
382 : // span the midpoint are probably relatively rare, so don't worry about them
383 : // for now.
384 : if (Subtarget->hasBFE())
385 : setHasExtractBitsInsn(true);
386 :
387 : setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
388 : setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
389 :
390 2492 : if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
391 : setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
392 : setOperationAction(ISD::FCEIL, MVT::f64, Legal);
393 : setOperationAction(ISD::FRINT, MVT::f64, Legal);
394 : } else {
395 : setOperationAction(ISD::FCEIL, MVT::f64, Custom);
396 : setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
397 : setOperationAction(ISD::FRINT, MVT::f64, Custom);
398 : setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
399 : }
400 :
401 : setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
402 :
403 : setOperationAction(ISD::FSIN, MVT::f32, Custom);
404 : setOperationAction(ISD::FCOS, MVT::f32, Custom);
405 : setOperationAction(ISD::FDIV, MVT::f32, Custom);
406 : setOperationAction(ISD::FDIV, MVT::f64, Custom);
407 :
408 2492 : if (Subtarget->has16BitInsts()) {
409 : setOperationAction(ISD::Constant, MVT::i16, Legal);
410 :
411 : setOperationAction(ISD::SMIN, MVT::i16, Legal);
412 : setOperationAction(ISD::SMAX, MVT::i16, Legal);
413 :
414 : setOperationAction(ISD::UMIN, MVT::i16, Legal);
415 : setOperationAction(ISD::UMAX, MVT::i16, Legal);
416 :
417 : setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
418 : AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
419 :
420 : setOperationAction(ISD::ROTR, MVT::i16, Promote);
421 : setOperationAction(ISD::ROTL, MVT::i16, Promote);
422 :
423 : setOperationAction(ISD::SDIV, MVT::i16, Promote);
424 : setOperationAction(ISD::UDIV, MVT::i16, Promote);
425 : setOperationAction(ISD::SREM, MVT::i16, Promote);
426 : setOperationAction(ISD::UREM, MVT::i16, Promote);
427 :
428 : setOperationAction(ISD::BSWAP, MVT::i16, Promote);
429 : setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
430 :
431 : setOperationAction(ISD::CTTZ, MVT::i16, Promote);
432 : setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
433 : setOperationAction(ISD::CTLZ, MVT::i16, Promote);
434 : setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
435 : setOperationAction(ISD::CTPOP, MVT::i16, Promote);
436 :
437 : setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
438 :
439 : setOperationAction(ISD::BR_CC, MVT::i16, Expand);
440 :
441 : setOperationAction(ISD::LOAD, MVT::i16, Custom);
442 :
443 : setTruncStoreAction(MVT::i64, MVT::i16, Expand);
444 :
445 : setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
446 : AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
447 : setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
448 : AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
449 :
450 : setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
451 : setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
452 : setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
453 : setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
454 :
455 : // F16 - Constant Actions.
456 : setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
457 :
458 : // F16 - Load/Store Actions.
459 : setOperationAction(ISD::LOAD, MVT::f16, Promote);
460 : AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
461 : setOperationAction(ISD::STORE, MVT::f16, Promote);
462 : AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
463 :
464 : // F16 - VOP1 Actions.
465 : setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
466 : setOperationAction(ISD::FCOS, MVT::f16, Promote);
467 : setOperationAction(ISD::FSIN, MVT::f16, Promote);
468 : setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
469 : setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
470 : setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
471 : setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
472 : setOperationAction(ISD::FROUND, MVT::f16, Custom);
473 :
474 : // F16 - VOP2 Actions.
475 : setOperationAction(ISD::BR_CC, MVT::f16, Expand);
476 : setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
477 : setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
478 : setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
479 : setOperationAction(ISD::FDIV, MVT::f16, Custom);
480 :
481 : // F16 - VOP3 Actions.
482 : setOperationAction(ISD::FMA, MVT::f16, Legal);
483 1240 : if (!Subtarget->hasFP16Denormals())
484 : setOperationAction(ISD::FMAD, MVT::f16, Legal);
485 :
486 6200 : for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
487 1294560 : for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
488 1289600 : switch (Op) {
489 : case ISD::LOAD:
490 : case ISD::STORE:
491 : case ISD::BUILD_VECTOR:
492 : case ISD::BITCAST:
493 : case ISD::EXTRACT_VECTOR_ELT:
494 : case ISD::INSERT_VECTOR_ELT:
495 : case ISD::INSERT_SUBVECTOR:
496 : case ISD::EXTRACT_SUBVECTOR:
497 : case ISD::SCALAR_TO_VECTOR:
498 : break;
499 4960 : case ISD::CONCAT_VECTORS:
500 : setOperationAction(Op, VT, Custom);
501 4960 : break;
502 1240000 : default:
503 : setOperationAction(Op, VT, Expand);
504 1240000 : break;
505 : }
506 : }
507 : }
508 :
509 : // XXX - Do these do anything? Vector constants turn into build_vector.
510 : setOperationAction(ISD::Constant, MVT::v2i16, Legal);
511 : setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
512 :
513 : setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
514 : setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
515 :
516 : setOperationAction(ISD::STORE, MVT::v2i16, Promote);
517 : AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
518 : setOperationAction(ISD::STORE, MVT::v2f16, Promote);
519 : AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
520 :
521 : setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
522 : AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
523 : setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
524 : AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
525 :
526 : setOperationAction(ISD::AND, MVT::v2i16, Promote);
527 : AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
528 : setOperationAction(ISD::OR, MVT::v2i16, Promote);
529 : AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
530 : setOperationAction(ISD::XOR, MVT::v2i16, Promote);
531 : AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
532 :
533 : setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
534 : AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
535 : setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
536 : AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
537 :
538 : setOperationAction(ISD::STORE, MVT::v4i16, Promote);
539 : AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
540 : setOperationAction(ISD::STORE, MVT::v4f16, Promote);
541 : AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
542 :
543 : setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
544 : setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
545 : setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
546 : setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
547 :
548 : setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
549 : setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
550 : setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
551 :
552 1240 : if (!Subtarget->hasVOP3PInsts()) {
553 : setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
554 : setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
555 : }
556 :
557 : setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
558 : // This isn't really legal, but this avoids the legalizer unrolling it (and
559 : // allows matching fneg (fabs x) patterns)
560 : setOperationAction(ISD::FABS, MVT::v2f16, Legal);
561 : }
562 :
563 2492 : if (Subtarget->hasVOP3PInsts()) {
564 : setOperationAction(ISD::ADD, MVT::v2i16, Legal);
565 : setOperationAction(ISD::SUB, MVT::v2i16, Legal);
566 : setOperationAction(ISD::MUL, MVT::v2i16, Legal);
567 : setOperationAction(ISD::SHL, MVT::v2i16, Legal);
568 : setOperationAction(ISD::SRL, MVT::v2i16, Legal);
569 : setOperationAction(ISD::SRA, MVT::v2i16, Legal);
570 : setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
571 : setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
572 : setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
573 : setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
574 :
575 : setOperationAction(ISD::FADD, MVT::v2f16, Legal);
576 : setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
577 : setOperationAction(ISD::FMA, MVT::v2f16, Legal);
578 : setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
579 : setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
580 : setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
581 :
582 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
583 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
584 :
585 : setOperationAction(ISD::SHL, MVT::v4i16, Custom);
586 : setOperationAction(ISD::SRA, MVT::v4i16, Custom);
587 : setOperationAction(ISD::SRL, MVT::v4i16, Custom);
588 : setOperationAction(ISD::ADD, MVT::v4i16, Custom);
589 : setOperationAction(ISD::SUB, MVT::v4i16, Custom);
590 : setOperationAction(ISD::MUL, MVT::v4i16, Custom);
591 :
592 : setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
593 : setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
594 : setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
595 : setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
596 :
597 : setOperationAction(ISD::FADD, MVT::v4f16, Custom);
598 : setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
599 : setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
600 : setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
601 : setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
602 :
603 : setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
604 : setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
605 : setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
606 : }
607 :
608 : setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
609 : setOperationAction(ISD::FABS, MVT::v4f16, Custom);
610 :
611 2492 : if (Subtarget->has16BitInsts()) {
612 : setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
613 : AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
614 : setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
615 : AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
616 : } else {
617 : // Legalization hack.
618 : setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
619 : setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
620 :
621 : setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
622 : setOperationAction(ISD::FABS, MVT::v2f16, Custom);
623 : }
624 :
625 14952 : for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
626 : setOperationAction(ISD::SELECT, VT, Custom);
627 : }
628 :
629 : setTargetDAGCombine(ISD::ADD);
630 : setTargetDAGCombine(ISD::ADDCARRY);
631 : setTargetDAGCombine(ISD::SUB);
632 : setTargetDAGCombine(ISD::SUBCARRY);
633 : setTargetDAGCombine(ISD::FADD);
634 : setTargetDAGCombine(ISD::FSUB);
635 : setTargetDAGCombine(ISD::FMINNUM);
636 : setTargetDAGCombine(ISD::FMAXNUM);
637 : setTargetDAGCombine(ISD::FMA);
638 : setTargetDAGCombine(ISD::SMIN);
639 : setTargetDAGCombine(ISD::SMAX);
640 : setTargetDAGCombine(ISD::UMIN);
641 : setTargetDAGCombine(ISD::UMAX);
642 : setTargetDAGCombine(ISD::SETCC);
643 : setTargetDAGCombine(ISD::AND);
644 : setTargetDAGCombine(ISD::OR);
645 : setTargetDAGCombine(ISD::XOR);
646 : setTargetDAGCombine(ISD::SINT_TO_FP);
647 : setTargetDAGCombine(ISD::UINT_TO_FP);
648 : setTargetDAGCombine(ISD::FCANONICALIZE);
649 : setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
650 : setTargetDAGCombine(ISD::ZERO_EXTEND);
651 : setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
652 : setTargetDAGCombine(ISD::BUILD_VECTOR);
653 :
654 : // All memory operations. Some folding on the pointer operand is done to help
655 : // match the constant offsets in the addressing modes.
656 : setTargetDAGCombine(ISD::LOAD);
657 : setTargetDAGCombine(ISD::STORE);
658 : setTargetDAGCombine(ISD::ATOMIC_LOAD);
659 : setTargetDAGCombine(ISD::ATOMIC_STORE);
660 : setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
661 : setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
662 : setTargetDAGCombine(ISD::ATOMIC_SWAP);
663 : setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
664 : setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
665 : setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
666 : setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
667 : setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
668 : setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
669 : setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
670 : setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
671 : setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
672 : setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
673 :
674 : setSchedulingPreference(Sched::RegPressure);
675 :
676 : // SI at least has hardware support for floating point exceptions, but no way
677 : // of using or handling them is implemented. They are also optional in OpenCL
678 : // (Section 7.3)
679 2492 : setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
680 2492 : }
681 :
682 1042597 : const GCNSubtarget *SITargetLowering::getSubtarget() const {
683 1042597 : return Subtarget;
684 : }
685 :
686 : //===----------------------------------------------------------------------===//
687 : // TargetLowering queries
688 : //===----------------------------------------------------------------------===//
689 :
690 : // v_mad_mix* support a conversion from f16 to f32.
691 : //
692 : // There is only one special case where this is OK to use when denormals
693 : // are enabled, and we do not currently handle it.
694 24 : bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
695 : EVT DestVT, EVT SrcVT) const {
696 24 : return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
697 2 : (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
698 46 : DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
699 11 : SrcVT.getScalarType() == MVT::f16;
700 : }
701 :
702 32 : bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
703 : // SI has some legal vector types, but no legal vector operations. Say no
704 : // shuffles are legal in order to prefer scalarizing some vector operations.
705 32 : return false;
706 : }
707 :
708 146769 : MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
709 : CallingConv::ID CC,
710 : EVT VT) const {
711 : // TODO: Consider splitting all arguments into 32-bit pieces.
712 175209 : if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
713 8786 : EVT ScalarVT = VT.getScalarType();
714 8786 : unsigned Size = ScalarVT.getSizeInBits();
715 8786 : if (Size == 32)
716 8356 : return ScalarVT.getSimpleVT();
717 :
718 1705 : if (Size == 64)
719 167 : return MVT::i32;
720 :
721 1538 : if (Size == 16 && Subtarget->has16BitInsts())
722 1987 : return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
723 : }
724 :
725 138413 : return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
726 : }
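// Illustrative consequences of the mapping above, assuming a non-kernel
// calling convention on a subtarget with 16-bit instructions: a <2 x float>
// argument stays as two f32 registers, a <2 x i64> argument is split into i32
// pieces, and a <4 x half> argument is packed into v2f16 registers (the
// matching register counts come from getNumRegistersForCallingConv below).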
727 :
728 146769 : unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
729 : CallingConv::ID CC,
730 : EVT VT) const {
731 175209 : if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
732 : unsigned NumElts = VT.getVectorNumElements();
733 8786 : EVT ScalarVT = VT.getScalarType();
734 8786 : unsigned Size = ScalarVT.getSizeInBits();
735 :
736 8786 : if (Size == 32)
737 8356 : return NumElts;
738 :
739 1705 : if (Size == 64)
740 167 : return 2 * NumElts;
741 :
742 1538 : if (Size == 16 && Subtarget->has16BitInsts())
743 1108 : return (VT.getVectorNumElements() + 1) / 2;
744 : }
745 :
746 138413 : return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
747 : }
748 :
749 3407 : unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
750 : LLVMContext &Context, CallingConv::ID CC,
751 : EVT VT, EVT &IntermediateVT,
752 : unsigned &NumIntermediates, MVT &RegisterVT) const {
753 6813 : if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
754 : unsigned NumElts = VT.getVectorNumElements();
755 3406 : EVT ScalarVT = VT.getScalarType();
756 3406 : unsigned Size = ScalarVT.getSizeInBits();
757 3406 : if (Size == 32) {
758 2851 : RegisterVT = ScalarVT.getSimpleVT();
759 2851 : IntermediateVT = RegisterVT;
760 2851 : NumIntermediates = NumElts;
761 3187 : return NumIntermediates;
762 : }
763 :
764 555 : if (Size == 64) {
765 94 : RegisterVT = MVT::i32;
766 94 : IntermediateVT = RegisterVT;
767 94 : NumIntermediates = 2 * NumElts;
768 94 : return NumIntermediates;
769 : }
770 :
771 : // FIXME: We should fix the ABI to be the same on targets without 16-bit
772 : // support, but unless we can properly handle 3-vectors, it will be still be
773 : // inconsistent.
774 461 : if (Size == 16 && Subtarget->has16BitInsts()) {
775 242 : RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
776 242 : IntermediateVT = RegisterVT;
777 242 : NumIntermediates = (NumElts + 1) / 2;
778 242 : return NumIntermediates;
779 : }
780 : }
781 :
782 220 : return TargetLowering::getVectorTypeBreakdownForCallingConv(
783 220 : Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
784 : }
785 :
786 26154 : bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
787 : const CallInst &CI,
788 : MachineFunction &MF,
789 : unsigned IntrID) const {
790 26154 : if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
791 26154 : AMDGPU::lookupRsrcIntrinsic(IntrID)) {
792 : AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
793 1918 : (Intrinsic::ID)IntrID);
794 1918 : if (Attr.hasFnAttribute(Attribute::ReadNone))
795 : return false;
796 :
797 1822 : SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
798 :
799 1822 : if (RsrcIntr->IsImage) {
800 709 : Info.ptrVal = MFI->getImagePSV(
801 709 : *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
802 709 : CI.getArgOperand(RsrcIntr->RsrcArg));
803 709 : Info.align = 0;
804 : } else {
805 1113 : Info.ptrVal = MFI->getBufferPSV(
806 1113 : *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
807 1113 : CI.getArgOperand(RsrcIntr->RsrcArg));
808 : }
809 :
810 1822 : Info.flags = MachineMemOperand::MODereferenceable;
811 1822 : if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
812 1071 : Info.opc = ISD::INTRINSIC_W_CHAIN;
813 1071 : Info.memVT = MVT::getVT(CI.getType());
814 : Info.flags |= MachineMemOperand::MOLoad;
815 751 : } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
816 501 : Info.opc = ISD::INTRINSIC_VOID;
817 501 : Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
818 : Info.flags |= MachineMemOperand::MOStore;
819 : } else {
820 : // Atomic
821 250 : Info.opc = ISD::INTRINSIC_W_CHAIN;
822 250 : Info.memVT = MVT::getVT(CI.getType());
823 : Info.flags = MachineMemOperand::MOLoad |
824 : MachineMemOperand::MOStore |
825 : MachineMemOperand::MODereferenceable;
826 :
827 : // XXX - Should this be volatile without known ordering?
828 : Info.flags |= MachineMemOperand::MOVolatile;
829 : }
830 1822 : return true;
831 : }
832 :
833 : switch (IntrID) {
834 245 : case Intrinsic::amdgcn_atomic_inc:
835 : case Intrinsic::amdgcn_atomic_dec:
836 : case Intrinsic::amdgcn_ds_fadd:
837 : case Intrinsic::amdgcn_ds_fmin:
838 : case Intrinsic::amdgcn_ds_fmax: {
839 245 : Info.opc = ISD::INTRINSIC_W_CHAIN;
840 245 : Info.memVT = MVT::getVT(CI.getType());
841 245 : Info.ptrVal = CI.getOperand(0);
842 245 : Info.align = 0;
843 245 : Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
844 :
845 : const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
846 242 : if (!Vol || !Vol->isZero())
847 : Info.flags |= MachineMemOperand::MOVolatile;
848 :
849 : return true;
850 : }
851 :
852 : default:
853 : return false;
854 : }
855 : }
856 :
857 31707 : bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
858 : SmallVectorImpl<Value*> &Ops,
859 : Type *&AccessTy) const {
860 : switch (II->getIntrinsicID()) {
861 269 : case Intrinsic::amdgcn_atomic_inc:
862 : case Intrinsic::amdgcn_atomic_dec:
863 : case Intrinsic::amdgcn_ds_fadd:
864 : case Intrinsic::amdgcn_ds_fmin:
865 : case Intrinsic::amdgcn_ds_fmax: {
866 269 : Value *Ptr = II->getArgOperand(0);
867 269 : AccessTy = II->getType();
868 269 : Ops.push_back(Ptr);
869 : return true;
870 : }
871 : default:
872 : return false;
873 : }
874 : }
875 :
876 54857 : bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
877 54857 : if (!Subtarget->hasFlatInstOffsets()) {
878 : // Flat instructions do not have offsets, and only have the register
879 : // address.
880 84784 : return AM.BaseOffs == 0 && AM.Scale == 0;
881 : }
882 :
883 : // GFX9 added a 13-bit signed offset. When using regular flat instructions,
884 : // the sign bit is ignored and is treated as a 12-bit unsigned offset.
885 :
886 : // Just r + i
887 2704 : return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
888 : }
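// Sketch of what the check above accepts (illustrative values): without flat
// instruction offsets only a bare register address (BaseOffs == 0, Scale == 0)
// is legal; with them, "ptr + 4095" fits the 12-bit unsigned offset while
// "ptr + 4096" or any scaled index (Scale != 0) does not.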
889 :
890 111122 : bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
891 111122 : if (Subtarget->hasFlatGlobalInsts())
892 41446 : return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
893 :
894 90399 : if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
895 : // Assume that we will use FLAT for all global memory accesses
896 : // on VI.
897 : // FIXME: This assumption is currently wrong. On VI we still use
898 : // MUBUF instructions for the r + i addressing mode. As currently
899 : // implemented, the MUBUF instructions only work on buffer < 4GB.
900 : // It may be possible to support > 4GB buffers with MUBUF instructions,
901 : // by setting the stride value in the resource descriptor which would
902 : // increase the size limit to (stride * 4GB). However, this is risky,
903 : // because it has never been validated.
904 43763 : return isLegalFlatAddressingMode(AM);
905 : }
906 :
907 46636 : return isLegalMUBUFAddressingMode(AM);
908 : }
909 :
910 52973 : bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
911 : // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
912 : // additionally can do r + r + i with addr64. 32-bit has more addressing
913 : // mode options. Depending on the resource constant, it can also do
914 : // (i64 r0) + (i32 r1) * (i14 i).
915 : //
916 : // Private arrays end up using a scratch buffer most of the time, so also
917 : // assume those use MUBUF instructions. Scratch loads / stores are currently
918 : // implemented as mubuf instructions with offen bit set, so slightly
919 : // different than the normal addr64.
920 52973 : if (!isUInt<12>(AM.BaseOffs))
921 : return false;
922 :
923 : // FIXME: Since we can split immediate into soffset and immediate offset,
924 : // would it make sense to allow any immediate?
925 :
926 52452 : switch (AM.Scale) {
927 : case 0: // r + i or just i, depending on HasBaseReg.
928 : return true;
929 : case 1:
930 : return true; // We have r + r or r + i.
931 855 : case 2:
932 855 : if (AM.HasBaseReg) {
933 : // Reject 2 * r + r.
934 855 : return false;
935 : }
936 :
937 : // Allow 2 * r as r + r
938 : // Or 2 * r + i is allowed as r + r + i.
939 : return true;
940 13051 : default: // Don't allow n * r
941 13051 : return false;
942 : }
943 : }
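// Illustrative examples for the scale handling above: "r + 4094" and
// "r0 + r1 + 8" are accepted, a lone "2 * r" is accepted because it can be
// rewritten as "r + r", but "2 * r0 + r1" and any larger register multiple
// are rejected.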
944 :
945 221586 : bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
946 : const AddrMode &AM, Type *Ty,
947 : unsigned AS, Instruction *I) const {
948 : // No global is ever allowed as a base.
949 221586 : if (AM.BaseGV)
950 : return false;
951 :
952 218800 : if (AS == AMDGPUAS::GLOBAL_ADDRESS)
953 82368 : return isLegalGlobalAddressingMode(AM);
954 :
955 272864 : if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
956 136432 : AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
957 : // If the offset isn't a multiple of 4, it probably isn't going to be
958 : // correctly aligned.
959 : // FIXME: Can we get the real alignment here?
960 97945 : if (AM.BaseOffs % 4 != 0)
961 99 : return isLegalMUBUFAddressingMode(AM);
962 :
963 : // There are no SMRD extloads, so if we have to do a small type access we
964 : // will use a MUBUF load.
965 : // FIXME?: We also need to do this if unaligned, but we don't know the
966 : // alignment here.
967 195692 : if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
968 28754 : return isLegalGlobalAddressingMode(AM);
969 :
970 69092 : if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
971 : // SMRD instructions have an 8-bit, dword offset on SI.
972 19754 : if (!isUInt<8>(AM.BaseOffs / 4))
973 : return false;
974 49338 : } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
975 : // On CI+, this can also be a 32-bit literal constant offset. If it fits
976 : // in 8-bits, it can use a smaller encoding.
977 9715 : if (!isUInt<32>(AM.BaseOffs / 4))
978 : return false;
979 39623 : } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
980 : // On VI, these use the SMEM format and the offset is 20-bit in bytes.
981 39623 : if (!isUInt<20>(AM.BaseOffs))
982 : return false;
983 : } else
984 0 : llvm_unreachable("unhandled generation");
985 :
986 68951 : if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
987 : return true;
988 :
989 393 : if (AM.Scale == 1 && AM.HasBaseReg)
990 : return true;
991 :
992 393 : return false;
993 :
994 38487 : } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
995 6238 : return isLegalMUBUFAddressingMode(AM);
996 32249 : } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
997 : AS == AMDGPUAS::REGION_ADDRESS) {
998 : // Basic, single offset DS instructions allow a 16-bit unsigned immediate
999 : // field.
1000 : // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1001 : // an 8-bit dword offset but we don't know the alignment here.
1002 21155 : if (!isUInt<16>(AM.BaseOffs))
1003 : return false;
1004 :
1005 19369 : if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1006 : return true;
1007 :
1008 3786 : if (AM.Scale == 1 && AM.HasBaseReg)
1009 : return true;
1010 :
1011 2426 : return false;
1012 11094 : } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1013 : AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1014 : // For an unknown address space, this usually means that this is for some
1015 : // reason being used for pure arithmetic, and not based on some addressing
1016 : // computation. We don't have instructions that compute pointers with any
1017 : // addressing modes, so treat them as having no offset like flat
1018 : // instructions.
1019 11094 : return isLegalFlatAddressingMode(AM);
1020 : } else {
1021 0 : llvm_unreachable("unhandled address space");
1022 : }
1023 : }
1024 :
1025 15492 : bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1026 : const SelectionDAG &DAG) const {
1027 15492 : if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1028 7251 : return (MemVT.getSizeInBits() <= 4 * 32);
1029 8241 : } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1030 3301 : unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1031 3301 : return (MemVT.getSizeInBits() <= MaxPrivateBits);
1032 4940 : } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
1033 4940 : return (MemVT.getSizeInBits() <= 2 * 32);
1034 : }
1035 : return true;
1036 : }
1037 :
1038 130502 : bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1039 : unsigned AddrSpace,
1040 : unsigned Align,
1041 : bool *IsFast) const {
1042 130502 : if (IsFast)
1043 83883 : *IsFast = false;
1044 :
1045 : // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1046 : // which isn't a simple VT.
1047 : // Until MVT is extended to handle this, simply check for the size and
1048 : // rely on the condition below: allow accesses if the size is a multiple of 4.
1049 130502 : if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1050 : VT.getStoreSize() > 16)) {
1051 0 : return false;
1052 : }
1053 :
1054 130502 : if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1055 : AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1056 : // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1057 : // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1058 : // with adjacent offsets.
1059 8037 : bool AlignedBy4 = (Align % 4 == 0);
1060 8037 : if (IsFast)
1061 5971 : *IsFast = AlignedBy4;
1062 :
1063 8037 : return AlignedBy4;
1064 : }
1065 :
1066 : // FIXME: We have to be conservative here and assume that flat operations
1067 : // will access scratch. If we had access to the IR function, then we
1068 : // could determine if any private memory was used in the function.
1069 122465 : if (!Subtarget->hasUnalignedScratchAccess() &&
1070 244858 : (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1071 122429 : AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
1072 863 : bool AlignedBy4 = Align >= 4;
1073 863 : if (IsFast)
1074 673 : *IsFast = AlignedBy4;
1075 :
1076 863 : return AlignedBy4;
1077 : }
1078 :
1079 121602 : if (Subtarget->hasUnalignedBufferAccess()) {
1080 : // If we have a uniform constant load, it still requires using a slow
1081 : // buffer instruction if unaligned.
1082 6651 : if (IsFast) {
1083 4382 : *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1084 4382 : AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1085 675 : (Align % 4 == 0) : true;
1086 : }
1087 :
1088 6651 : return true;
1089 : }
1090 :
1091 : // Smaller than dword value must be aligned.
1092 114951 : if (VT.bitsLT(MVT::i32))
1093 : return false;
1094 :
1095 : // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1096 : // byte-address are ignored, thus forcing Dword alignment.
1097 : // This applies to private, global, and constant memory.
1098 113161 : if (IsFast)
1099 71273 : *IsFast = true;
1100 :
1101 117755 : return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1102 : }
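// Illustrative outcomes of the checks above: an 8-byte LDS access with only
// 4-byte alignment is allowed and reported fast (ds_read2/write2_b32), a
// misaligned private or flat access is rejected unless the subtarget has
// unaligned scratch access, and an access wider than a dword (e.g. v2i32) is
// allowed whenever its alignment is a multiple of 4.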
1103 :
1104 124 : EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
1105 : unsigned SrcAlign, bool IsMemset,
1106 : bool ZeroMemset,
1107 : bool MemcpyStrSrc,
1108 : MachineFunction &MF) const {
1109 : // FIXME: Should account for address space here.
1110 :
1111 : // The default fallback uses the private pointer size as a guess for a type to
1112 : // use. Make sure we switch these to 64-bit accesses.
1113 :
1114 124 : if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1115 94 : return MVT::v4i32;
1116 :
1117 30 : if (Size >= 8 && DstAlign >= 4)
1118 12 : return MVT::v2i32;
1119 :
1120 : // Use the default.
1121 18 : return MVT::Other;
1122 : }
1123 :
1124 : static bool isFlatGlobalAddrSpace(unsigned AS) {
1125 667 : return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1126 : AS == AMDGPUAS::FLAT_ADDRESS ||
1127 667 : AS == AMDGPUAS::CONSTANT_ADDRESS;
1128 : }
1129 :
1130 246 : bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1131 : unsigned DestAS) const {
1132 246 : return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
1133 : }
1134 :
1135 4734 : bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1136 : const MemSDNode *MemNode = cast<MemSDNode>(N);
1137 4734 : const Value *Ptr = MemNode->getMemOperand()->getValue();
1138 : const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1139 9032 : return I && I->getMetadata("amdgpu.noclobber");
1140 : }
1141 :
1142 94 : bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
1143 : unsigned DestAS) const {
1144 : // Flat -> private/local is a simple truncate.
1145 : // Flat -> global is a no-op.
1146 94 : if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1147 : return true;
1148 :
1149 30 : return isNoopAddrSpaceCast(SrcAS, DestAS);
1150 : }
1151 :
1152 0 : bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1153 : const MemSDNode *MemNode = cast<MemSDNode>(N);
1154 :
1155 0 : return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1156 : }
1157 :
1158 : TargetLoweringBase::LegalizeTypeAction
1159 206860 : SITargetLowering::getPreferredVectorAction(EVT VT) const {
1160 206860 : if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1161 107180 : return TypeSplitVector;
1162 :
1163 99680 : return TargetLoweringBase::getPreferredVectorAction(VT);
1164 : }
1165 :
1166 32 : bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1167 : Type *Ty) const {
1168 : // FIXME: Could be smarter if called for vector constants.
1169 32 : return true;
1170 : }
1171 :
1172 303233 : bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1173 303233 : if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1174 31162 : switch (Op) {
1175 : case ISD::LOAD:
1176 : case ISD::STORE:
1177 :
1178 : // These operations are done with 32-bit instructions anyway.
1179 : case ISD::AND:
1180 : case ISD::OR:
1181 : case ISD::XOR:
1182 : case ISD::SELECT:
1183 : // TODO: Extensions?
1184 : return true;
1185 26971 : default:
1186 26971 : return false;
1187 : }
1188 : }
1189 :
1190 : // SimplifySetCC uses this function to determine whether or not it should
1191 : // create setcc with i1 operands. We don't have instructions for i1 setcc.
1192 682 : if (VT == MVT::i1 && Op == ISD::SETCC)
1193 24 : return false;
1194 :
1195 272047 : return TargetLowering::isTypeDesirableForOp(Op, VT);
1196 : }
1197 :
1198 41155 : SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1199 : const SDLoc &SL,
1200 : SDValue Chain,
1201 : uint64_t Offset) const {
1202 41155 : const DataLayout &DL = DAG.getDataLayout();
1203 41155 : MachineFunction &MF = DAG.getMachineFunction();
1204 41155 : const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1205 :
1206 : const ArgDescriptor *InputPtrReg;
1207 : const TargetRegisterClass *RC;
1208 :
1209 : std::tie(InputPtrReg, RC)
1210 : = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1211 :
1212 41155 : MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1213 : MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
1214 : SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1215 41155 : MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1216 :
1217 41155 : return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
1218 : }
1219 :
1220 42 : SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1221 : const SDLoc &SL) const {
1222 42 : uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1223 42 : FIRST_IMPLICIT);
1224 42 : return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1225 : }
1226 :
1227 41113 : SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1228 : const SDLoc &SL, SDValue Val,
1229 : bool Signed,
1230 : const ISD::InputArg *Arg) const {
1231 41113 : if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1232 80 : VT.bitsLT(MemVT)) {
1233 0 : unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1234 0 : Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1235 : }
1236 :
1237 41113 : if (MemVT.isFloatingPoint())
1238 2526 : Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
1239 38587 : else if (Signed)
1240 14 : Val = DAG.getSExtOrTrunc(Val, SL, VT);
1241 : else
1242 38573 : Val = DAG.getZExtOrTrunc(Val, SL, VT);
1243 :
1244 41113 : return Val;
1245 : }
1246 :
1247 41113 : SDValue SITargetLowering::lowerKernargMemParameter(
1248 : SelectionDAG &DAG, EVT VT, EVT MemVT,
1249 : const SDLoc &SL, SDValue Chain,
1250 : uint64_t Offset, unsigned Align, bool Signed,
1251 : const ISD::InputArg *Arg) const {
1252 41113 : Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
1253 41113 : PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
1254 41113 : MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1255 :
1256 : // Try to avoid using an extload by loading earlier than the argument address,
1257 : // and extracting the relevant bits. The load should hopefully be merged with
1258 : // the previous argument.
1259 41113 : if (MemVT.getStoreSize() < 4 && Align < 4) {
1260 : // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1261 : int64_t AlignDownOffset = alignDown(Offset, 4);
1262 1688 : int64_t OffsetDiff = Offset - AlignDownOffset;
1263 :
1264 1688 : EVT IntVT = MemVT.changeTypeToInteger();
1265 :
1266 : // TODO: If we passed in the base kernel offset we could have a better
1267 : // alignment than 4, but we don't really need it.
1268 1688 : SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1269 : SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1270 : MachineMemOperand::MODereferenceable |
1271 1688 : MachineMemOperand::MOInvariant);
1272 :
1273 1688 : SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1274 1688 : SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1275 :
1276 1688 : SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1277 1688 : ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1278 1688 : ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1279 :
1280 :
1281 3376 : return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1282 : }
1283 :
1284 39425 : SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1285 : SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
1286 : MachineMemOperand::MODereferenceable |
1287 39425 : MachineMemOperand::MOInvariant);
1288 :
1289 39425 : SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1290 78850 : return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1291 : }
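// Worked example of the align-down path above (hypothetical argument layout):
// a 2-byte argument at Offset = 6 with Align = 2 gives AlignDownOffset = 4 and
// OffsetDiff = 2, so we emit a dword load at offset 4, shift right by
// OffsetDiff * 8 = 16, and truncate to 16 bits, which lets the load merge with
// the load for the argument at offset 4.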
1292 :
1293 402 : SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1294 : const SDLoc &SL, SDValue Chain,
1295 : const ISD::InputArg &Arg) const {
1296 402 : MachineFunction &MF = DAG.getMachineFunction();
1297 402 : MachineFrameInfo &MFI = MF.getFrameInfo();
1298 :
1299 402 : if (Arg.Flags.isByVal()) {
1300 71 : unsigned Size = Arg.Flags.getByValSize();
1301 71 : int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1302 71 : return DAG.getFrameIndex(FrameIdx, MVT::i32);
1303 : }
1304 :
1305 331 : unsigned ArgOffset = VA.getLocMemOffset();
1306 331 : unsigned ArgSize = VA.getValVT().getStoreSize();
1307 :
1308 331 : int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1309 :
1310 : // Create load nodes to retrieve arguments from the stack.
1311 331 : SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1312 : SDValue ArgValue;
1313 :
1314 : // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
1315 : ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1316 : MVT MemVT = VA.getValVT();
1317 :
1318 331 : switch (VA.getLocInfo()) {
1319 : default:
1320 : break;
1321 0 : case CCValAssign::BCvt:
1322 : MemVT = VA.getLocVT();
1323 0 : break;
1324 0 : case CCValAssign::SExt:
1325 : ExtType = ISD::SEXTLOAD;
1326 0 : break;
1327 0 : case CCValAssign::ZExt:
1328 : ExtType = ISD::ZEXTLOAD;
1329 0 : break;
1330 3 : case CCValAssign::AExt:
1331 : ExtType = ISD::EXTLOAD;
1332 3 : break;
1333 : }
1334 :
1335 331 : ArgValue = DAG.getExtLoad(
1336 : ExtType, SL, VA.getLocVT(), Chain, FIN,
1337 : MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1338 331 : MemVT);
1339 331 : return ArgValue;
1340 : }
1341 :
1342 13733 : SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1343 : const SIMachineFunctionInfo &MFI,
1344 : EVT VT,
1345 : AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1346 : const ArgDescriptor *Reg;
1347 : const TargetRegisterClass *RC;
1348 :
1349 : std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1350 13733 : return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1351 : }
1352 :
1353 0 : static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1354 : CallingConv::ID CallConv,
1355 : ArrayRef<ISD::InputArg> Ins,
1356 : BitVector &Skipped,
1357 : FunctionType *FType,
1358 : SIMachineFunctionInfo *Info) {
1359 0 : for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1360 0 : const ISD::InputArg *Arg = &Ins[I];
1361 :
1362 : assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1363 : "vector type argument should have been split");
1364 :
1365 : // First check if it's a PS input addr.
1366 0 : if (CallConv == CallingConv::AMDGPU_PS &&
1367 0 : !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
1368 :
1369 0 : bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1370 :
1371 : // Inconveniently only the first part of the split is marked as isSplit,
1372 : // so skip to the end. We only want to increment PSInputNum once for the
1373 : // entire split argument.
1374 0 : if (Arg->Flags.isSplit()) {
1375 0 : while (!Arg->Flags.isSplitEnd()) {
1376 : assert(!Arg->VT.isVector() &&
1377 : "unexpected vector split in ps argument type");
1378 0 : if (!SkipArg)
1379 0 : Splits.push_back(*Arg);
1380 0 : Arg = &Ins[++I];
1381 : }
1382 : }
1383 :
1384 0 : if (SkipArg) {
1385 : // We can safely skip PS inputs.
1386 0 : Skipped.set(Arg->getOrigArgIndex());
1387 0 : ++PSInputNum;
1388 0 : continue;
1389 : }
1390 :
1391 : Info->markPSInputAllocated(PSInputNum);
1392 0 : if (Arg->Used)
1393 : Info->markPSInputEnabled(PSInputNum);
1394 :
1395 0 : ++PSInputNum;
1396 : }
1397 :
1398 0 : Splits.push_back(*Arg);
1399 : }
1400 0 : }
1401 :
1402 : // Allocate special inputs passed in VGPRs.
1403 0 : static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1404 : MachineFunction &MF,
1405 : const SIRegisterInfo &TRI,
1406 : SIMachineFunctionInfo &Info) {
1407 0 : if (Info.hasWorkItemIDX()) {
1408 : unsigned Reg = AMDGPU::VGPR0;
1409 0 : MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1410 :
1411 0 : CCInfo.AllocateReg(Reg);
1412 0 : Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1413 : }
1414 :
1415 0 : if (Info.hasWorkItemIDY()) {
1416 : unsigned Reg = AMDGPU::VGPR1;
1417 0 : MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1418 :
1419 0 : CCInfo.AllocateReg(Reg);
1420 0 : Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1421 : }
1422 :
1423 0 : if (Info.hasWorkItemIDZ()) {
1424 : unsigned Reg = AMDGPU::VGPR2;
1425 0 : MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1426 :
1427 0 : CCInfo.AllocateReg(Reg);
1428 0 : Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1429 : }
1430 0 : }
1431 :
1432 : // Try to allocate a VGPR at the end of the argument list, or if no argument
1433 : // VGPRs are left, allocate a stack slot.
1434 38 : static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1435 : ArrayRef<MCPhysReg> ArgVGPRs
1436 38 : = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1437 : unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1438 38 : if (RegIdx == ArgVGPRs.size()) {
1439 : // Spill to stack required.
1440 8 : int64_t Offset = CCInfo.AllocateStack(4, 4);
1441 :
1442 : return ArgDescriptor::createStack(Offset);
1443 : }
1444 :
1445 30 : unsigned Reg = ArgVGPRs[RegIdx];
1446 30 : Reg = CCInfo.AllocateReg(Reg);
1447 : assert(Reg != AMDGPU::NoRegister);
1448 :
1449 30 : MachineFunction &MF = CCInfo.getMachineFunction();
1450 30 : MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1451 : return ArgDescriptor::createRegister(Reg);
1452 : }
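     : // In short: the first 32 VGPRs form the argument-register pool; once they are
     : // exhausted, the work-item ID input falls back to a 4-byte stack slot, and
     : // otherwise the chosen VGPR is marked live-in and returned as a register
     : // descriptor.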
1453 :
1454 0 : static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1455 : const TargetRegisterClass *RC,
1456 : unsigned NumArgRegs) {
1457 0 : ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1458 : unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1459 0 : if (RegIdx == ArgSGPRs.size())
1460 0 : report_fatal_error("ran out of SGPRs for arguments");
1461 :
1462 0 : unsigned Reg = ArgSGPRs[RegIdx];
1463 0 : Reg = CCInfo.AllocateReg(Reg);
1464 : assert(Reg != AMDGPU::NoRegister);
1465 :
1466 0 : MachineFunction &MF = CCInfo.getMachineFunction();
1467 0 : MF.addLiveIn(Reg, RC);
1468 0 : return ArgDescriptor::createRegister(Reg);
1469 : }
1470 :
1471 : static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1472 0 : return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1473 : }
1474 :
1475 : static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1476 0 : return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1477 : }
1478 :
1479 0 : static void allocateSpecialInputVGPRs(CCState &CCInfo,
1480 : MachineFunction &MF,
1481 : const SIRegisterInfo &TRI,
1482 : SIMachineFunctionInfo &Info) {
1483 0 : if (Info.hasWorkItemIDX())
1484 0 : Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1485 :
1486 0 : if (Info.hasWorkItemIDY())
1487 0 : Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1488 :
1489 0 : if (Info.hasWorkItemIDZ())
1490 0 : Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1491 0 : }
1492 :
1493 0 : static void allocateSpecialInputSGPRs(CCState &CCInfo,
1494 : MachineFunction &MF,
1495 : const SIRegisterInfo &TRI,
1496 : SIMachineFunctionInfo &Info) {
1497 : auto &ArgInfo = Info.getArgInfo();
1498 :
1499 : // TODO: Unify handling with private memory pointers.
1500 :
1501 0 : if (Info.hasDispatchPtr())
1502 0 : ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1503 :
1504 0 : if (Info.hasQueuePtr())
1505 0 : ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1506 :
1507 0 : if (Info.hasKernargSegmentPtr())
1508 0 : ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1509 :
1510 0 : if (Info.hasDispatchID())
1511 0 : ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1512 :
1513 : // flat_scratch_init is not applicable for non-kernel functions.
1514 :
1515 0 : if (Info.hasWorkGroupIDX())
1516 0 : ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1517 :
1518 0 : if (Info.hasWorkGroupIDY())
1519 0 : ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1520 :
1521 0 : if (Info.hasWorkGroupIDZ())
1522 0 : ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1523 :
1524 0 : if (Info.hasImplicitArgPtr())
1525 0 : ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1526 0 : }
1527 :
1528 : // Allocate special inputs passed in user SGPRs.
1529 17950 : static void allocateHSAUserSGPRs(CCState &CCInfo,
1530 : MachineFunction &MF,
1531 : const SIRegisterInfo &TRI,
1532 : SIMachineFunctionInfo &Info) {
1533 17950 : if (Info.hasImplicitBufferPtr()) {
1534 2 : unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1535 2 : MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1536 2 : CCInfo.AllocateReg(ImplicitBufferPtrReg);
1537 : }
1538 :
1539 : // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1540 17950 : if (Info.hasPrivateSegmentBuffer()) {
1541 2564 : unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1542 2564 : MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1543 2564 : CCInfo.AllocateReg(PrivateSegmentBufferReg);
1544 : }
1545 :
1546 17950 : if (Info.hasDispatchPtr()) {
1547 42 : unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1548 42 : MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1549 42 : CCInfo.AllocateReg(DispatchPtrReg);
1550 : }
1551 :
1552 17950 : if (Info.hasQueuePtr()) {
1553 57 : unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1554 57 : MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1555 57 : CCInfo.AllocateReg(QueuePtrReg);
1556 : }
1557 :
1558 17950 : if (Info.hasKernargSegmentPtr()) {
1559 15176 : unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1560 15176 : MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1561 15176 : CCInfo.AllocateReg(InputPtrReg);
1562 : }
1563 :
1564 17950 : if (Info.hasDispatchID()) {
1565 5 : unsigned DispatchIDReg = Info.addDispatchID(TRI);
1566 5 : MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1567 5 : CCInfo.AllocateReg(DispatchIDReg);
1568 : }
1569 :
1570 17950 : if (Info.hasFlatScratchInit()) {
1571 381 : unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1572 381 : MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1573 381 : CCInfo.AllocateReg(FlatScratchInitReg);
1574 : }
1575 :
1576 : // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1577 : // these from the dispatch pointer.
1578 17950 : }
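     : // The order of the checks above matters: each Info.add*() call hands out the
     : // next user SGPRs in sequence, so this ordering is what fixes which physical
     : // SGPRs the implicit buffer pointer, dispatch pointer, queue pointer, kernarg
     : // segment pointer, dispatch ID and flat scratch init end up in.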
1579 :
1580 : // Allocate special input registers that are initialized per-wave.
1581 17950 : static void allocateSystemSGPRs(CCState &CCInfo,
1582 : MachineFunction &MF,
1583 : SIMachineFunctionInfo &Info,
1584 : CallingConv::ID CallConv,
1585 : bool IsShader) {
1586 17950 : if (Info.hasWorkGroupIDX()) {
1587 : unsigned Reg = Info.addWorkGroupIDX();
1588 16213 : MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1589 16213 : CCInfo.AllocateReg(Reg);
1590 : }
1591 :
1592 17950 : if (Info.hasWorkGroupIDY()) {
1593 : unsigned Reg = Info.addWorkGroupIDY();
1594 24 : MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1595 24 : CCInfo.AllocateReg(Reg);
1596 : }
1597 :
1598 17950 : if (Info.hasWorkGroupIDZ()) {
1599 : unsigned Reg = Info.addWorkGroupIDZ();
1600 24 : MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1601 24 : CCInfo.AllocateReg(Reg);
1602 : }
1603 :
1604 17950 : if (Info.hasWorkGroupInfo()) {
1605 : unsigned Reg = Info.addWorkGroupInfo();
1606 0 : MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1607 0 : CCInfo.AllocateReg(Reg);
1608 : }
1609 :
1610 17950 : if (Info.hasPrivateSegmentWaveByteOffset()) {
1611 : // Scratch wave offset passed in system SGPR.
1612 : unsigned PrivateSegmentWaveByteOffsetReg;
1613 :
1614 16263 : if (IsShader) {
1615 : PrivateSegmentWaveByteOffsetReg =
1616 : Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1617 :
1618 : // This is true if the scratch wave byte offset doesn't have a fixed
1619 : // location.
1620 50 : if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1621 : PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1622 : Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1623 : }
1624 : } else
1625 : PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1626 :
1627 16263 : MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1628 16263 : CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1629 : }
1630 17950 : }
1631 :
1632 17974 : static void reservePrivateMemoryRegs(const TargetMachine &TM,
1633 : MachineFunction &MF,
1634 : const SIRegisterInfo &TRI,
1635 : SIMachineFunctionInfo &Info) {
1636 : // Now that we've figured out where the scratch register inputs are, see if
1637 : // we should reserve the arguments and use them directly.
1638 17974 : MachineFrameInfo &MFI = MF.getFrameInfo();
1639 : bool HasStackObjects = MFI.hasStackObjects();
1640 :
1641 : // Record that we know we have non-spill stack objects so we don't need to
1642 : // check all stack objects later.
1643 17974 : if (HasStackObjects)
1644 : Info.setHasNonSpillStackObjects(true);
1645 :
1646 : // Everything live out of a block is spilled with fast regalloc, so it's
1647 : // almost certain that spilling will be required.
1648 17974 : if (TM.getOptLevel() == CodeGenOpt::None)
1649 : HasStackObjects = true;
1650 :
1651 : // For now assume stack access is needed in any callee functions, so we need
1652 : // the scratch registers to pass in.
1653 17790 : bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1654 :
1655 17974 : const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1656 17974 : if (ST.isAmdHsaOrMesa(MF.getFunction())) {
1657 2567 : if (RequiresStackAccess) {
1658 : // If we have stack objects, we unquestionably need the private buffer
1659 : // resource. For the Code Object V2 ABI, this will be the first 4 user
1660 : // SGPR inputs. We can reserve those and use them directly.
1661 :
1662 : unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1663 : AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
1664 : Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1665 :
1666 484 : if (MFI.hasCalls()) {
1667 : // If we have calls, we need to keep the frame register in a register
1668 : // that won't be clobbered by a call, so ensure it is copied somewhere.
1669 :
1670 : // This is not a problem for the scratch wave offset, because the same
1671 : // registers are reserved in all functions.
1672 :
1673 : // FIXME: Nothing is really ensuring this is a call preserved register,
1674 : // it's just selected from the end so it happens to be.
1675 : unsigned ReservedOffsetReg
1676 265 : = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1677 : Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1678 : } else {
1679 : unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1680 : AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1681 : Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1682 : }
1683 : } else {
1684 : unsigned ReservedBufferReg
1685 2083 : = TRI.reservedPrivateSegmentBufferReg(MF);
1686 : unsigned ReservedOffsetReg
1687 2083 : = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1688 :
1689 : // We tentatively reserve the last registers (skipping the last two
1690 : // which may contain VCC). After register allocation, we'll replace
1691 : // these with the ones immediately after those which were really
1692 : // allocated. In the prologue, copies will be inserted from the argument
1693 : // to these reserved registers.
1694 : Info.setScratchRSrcReg(ReservedBufferReg);
1695 : Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1696 : }
1697 : } else {
1698 15407 : unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1699 :
1700 : // Without HSA, relocations are used for the scratch pointer and the
1701 : // buffer resource setup is always inserted in the prologue. Scratch wave
1702 : // offset is still in an input SGPR.
1703 : Info.setScratchRSrcReg(ReservedBufferReg);
1704 :
1705 15407 : if (HasStackObjects && !MFI.hasCalls()) {
1706 : unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1707 : AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1708 : Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1709 : } else {
1710 : unsigned ReservedOffsetReg
1711 15081 : = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1712 : Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1713 : }
1714 : }
1715 17974 : }
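     : // Summary of the reservation logic above: any entry function that may touch
     : // scratch needs both a buffer resource descriptor and a wave byte offset.
     : // When the preloaded inputs exist they are reused directly; otherwise
     : // registers near the end of the SGPR file are tentatively reserved and fixed
     : // up after register allocation.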
1716 :
1717 19524 : bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1718 19524 : const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1719 19524 : return !Info->isEntryFunction();
1720 : }
1721 :
1722 1755 : void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1723 :
1724 1755 : }
1725 :
1726 1754 : void SITargetLowering::insertCopiesSplitCSR(
1727 : MachineBasicBlock *Entry,
1728 : const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1729 1754 : const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1730 :
1731 1754 : const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1732 1754 : if (!IStart)
1733 1754 : return;
1734 :
1735 0 : const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1736 0 : MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1737 0 : MachineBasicBlock::iterator MBBI = Entry->begin();
1738 0 : for (const MCPhysReg *I = IStart; *I; ++I) {
1739 : const TargetRegisterClass *RC = nullptr;
1740 0 : if (AMDGPU::SReg_64RegClass.contains(*I))
1741 : RC = &AMDGPU::SGPR_64RegClass;
1742 0 : else if (AMDGPU::SReg_32RegClass.contains(*I))
1743 : RC = &AMDGPU::SGPR_32RegClass;
1744 : else
1745 0 : llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1746 :
1747 0 : unsigned NewVR = MRI->createVirtualRegister(RC);
1748 : // Create copy from CSR to a virtual register.
1749 0 : Entry->addLiveIn(*I);
1750 0 : BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1751 0 : .addReg(*I);
1752 :
1753 : // Insert the copy-back instructions right before the terminator.
1754 0 : for (auto *Exit : Exits)
1755 0 : BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1756 0 : TII->get(TargetOpcode::COPY), *I)
1757 0 : .addReg(NewVR);
1758 : }
1759 : }
1760 :
1761 19712 : SDValue SITargetLowering::LowerFormalArguments(
1762 : SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1763 : const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1764 : SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1765 19712 : const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1766 :
1767 19712 : MachineFunction &MF = DAG.getMachineFunction();
1768 19712 : const Function &Fn = MF.getFunction();
1769 : FunctionType *FType = MF.getFunction().getFunctionType();
1770 19712 : SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1771 19712 : const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1772 :
1773 39424 : if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1774 : DiagnosticInfoUnsupported NoGraphicsHSA(
1775 3 : Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1776 3 : DAG.getContext()->diagnose(NoGraphicsHSA);
1777 : return DAG.getEntryNode();
1778 : }
1779 :
1780 : // Create stack objects that are used for emitting the debugger prologue if
1781 : // the "amdgpu-debugger-emit-prologue" attribute was specified.
1782 19709 : if (ST.debuggerEmitPrologue())
1783 4 : createDebuggerPrologueStackObjects(MF);
1784 :
1785 : SmallVector<ISD::InputArg, 16> Splits;
1786 : SmallVector<CCValAssign, 16> ArgLocs;
1787 19709 : BitVector Skipped(Ins.size());
1788 : CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1789 39418 : *DAG.getContext());
1790 :
1791 19709 : bool IsShader = AMDGPU::isShader(CallConv);
1792 : bool IsKernel = AMDGPU::isKernel(CallConv);
1793 19709 : bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1794 :
1795 19709 : if (!IsEntryFunc) {
1796 : // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1797 : // this when allocating argument fixed offsets.
1798 1759 : CCInfo.AllocateStack(4, 4);
1799 : }
1800 :
1801 19709 : if (IsShader) {
1802 1737 : processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1803 :
1804 : // At least one interpolation mode must be enabled or else the GPU will
1805 : // hang.
1806 : //
1807 : // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1808 : // set PSInputAddr, the user wants to enable some bits after the compilation
1809 : // based on run-time states. Since we can't know what the final PSInputEna
1810 : // will look like, we shouldn't do anything here, and the user should take
1811 : // responsibility for the correct programming.
1812 : //
1813 : // Otherwise, the following restrictions apply:
1814 : // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1815 : // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1816 : // enabled too.
1817 1737 : if (CallConv == CallingConv::AMDGPU_PS) {
1818 1433 : if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1819 1136 : ((Info->getPSInputAddr() & 0xF) == 0 &&
1820 : Info->isPSInputAllocated(11))) {
1821 : CCInfo.AllocateReg(AMDGPU::VGPR0);
1822 : CCInfo.AllocateReg(AMDGPU::VGPR1);
1823 : Info->markPSInputAllocated(0);
1824 : Info->markPSInputEnabled(0);
1825 : }
1826 2866 : if (Subtarget->isAmdPalOS()) {
1827 : // For isAmdPalOS, the user does not enable some bits after compilation
1828 : // based on run-time states; the register values being generated here are
1829 : // the final ones set in hardware. Therefore we need to apply the
1830 : // workaround to PSInputAddr and PSInputEnable together. (The case where
1831 : // a bit is set in PSInputAddr but not PSInputEnable is where the
1832 : // frontend set up an input arg for a particular interpolation mode, but
1833 : // nothing uses that input arg. Really we should have an earlier pass
1834 : // that removes such an arg.)
1835 17 : unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1836 17 : if ((PsInputBits & 0x7F) == 0 ||
1837 14 : ((PsInputBits & 0xF) == 0 &&
1838 : (PsInputBits >> 11 & 1)))
1839 3 : Info->markPSInputEnabled(
1840 : countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
1841 : }
1842 : }
1843 :
1844 : assert(!Info->hasDispatchPtr() &&
1845 : !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1846 : !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1847 : !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1848 : !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1849 : !Info->hasWorkItemIDZ());
1850 17972 : } else if (IsKernel) {
1851 : assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
1852 : } else {
1853 1759 : Splits.append(Ins.begin(), Ins.end());
1854 : }
1855 :
1856 19709 : if (IsEntryFunc) {
1857 17950 : allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1858 17950 : allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1859 : }
1860 :
1861 19709 : if (IsKernel) {
1862 16213 : analyzeFormalArgumentsCompute(CCInfo, Ins);
1863 : } else {
1864 3496 : CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1865 3496 : CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1866 : }
1867 :
1868 : SmallVector<SDValue, 16> Chains;
1869 :
1870 : // FIXME: This is the minimum kernel argument alignment. We should improve
1871 : // this to the maximum alignment of the arguments.
1872 : //
1873 : // FIXME: Alignment of explicit arguments is totally broken with a non-0 explicit
1874 : // kern arg offset.
1875 : const unsigned KernelArgBaseAlign = 16;
1876 :
1877 83072 : for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
1878 63363 : const ISD::InputArg &Arg = Ins[i];
1879 63363 : if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
1880 4412 : InVals.push_back(DAG.getUNDEF(Arg.VT));
1881 43618 : continue;
1882 : }
1883 :
1884 61157 : CCValAssign &VA = ArgLocs[ArgIdx++];
1885 : MVT VT = VA.getLocVT();
1886 :
1887 61157 : if (IsEntryFunc && VA.isMemLoc()) {
1888 41010 : VT = Ins[i].VT;
1889 : EVT MemVT = VA.getLocVT();
1890 :
1891 41010 : const uint64_t Offset = VA.getLocMemOffset();
1892 41010 : unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
1893 :
1894 : SDValue Arg = lowerKernargMemParameter(
1895 82020 : DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
1896 41010 : Chains.push_back(Arg.getValue(1));
1897 :
1898 : auto *ParamTy =
1899 41010 : dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
1900 14128 : if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
1901 49102 : ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1902 : // On SI local pointers are just offsets into LDS, so they are always
1903 : // less than 16-bits. On CI and newer they could potentially be
1904 : // real pointers, so we can't guarantee their size.
1905 644 : Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1906 644 : DAG.getValueType(MVT::i16));
1907 : }
1908 :
1909 41010 : InVals.push_back(Arg);
1910 : continue;
1911 20147 : } else if (!IsEntryFunc && VA.isMemLoc()) {
1912 402 : SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1913 402 : InVals.push_back(Val);
1914 402 : if (!Arg.Flags.isByVal())
1915 331 : Chains.push_back(Val.getValue(1));
1916 : continue;
1917 : }
1918 :
1919 : assert(VA.isRegLoc() && "Parameter must be in a register!");
1920 :
1921 19745 : unsigned Reg = VA.getLocReg();
1922 19745 : const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
1923 : EVT ValVT = VA.getValVT();
1924 :
1925 19745 : Reg = MF.addLiveIn(Reg, RC);
1926 19745 : SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1927 :
1928 19745 : if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
1929 : // The return object should be reasonably addressable.
1930 :
1931 : // FIXME: This helps when the return is a real sret. If it is a
1932 : // FIXME: This helps when the return is a real sret. If it is an
1933 : // extra copy is inserted in SelectionDAGBuilder which obscures this.
1934 12 : unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
1935 12 : Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1936 12 : DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
1937 : }
1938 :
1939 : // If this is an 8 or 16-bit value, it is really passed promoted
1940 : // to 32 bits. Insert an assert[sz]ext to capture this, then
1941 : // truncate to the right size.
1942 19745 : switch (VA.getLocInfo()) {
1943 : case CCValAssign::Full:
1944 : break;
1945 : case CCValAssign::BCvt:
1946 0 : Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
1947 0 : break;
1948 : case CCValAssign::SExt:
1949 8 : Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
1950 8 : DAG.getValueType(ValVT));
1951 8 : Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1952 8 : break;
1953 : case CCValAssign::ZExt:
1954 12 : Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1955 12 : DAG.getValueType(ValVT));
1956 12 : Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1957 12 : break;
1958 : case CCValAssign::AExt:
1959 7 : Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1960 7 : break;
1961 0 : default:
1962 0 : llvm_unreachable("Unknown loc info!");
1963 : }
1964 :
1965 19745 : InVals.push_back(Val);
1966 : }
1967 :
1968 19709 : if (!IsEntryFunc) {
1969 : // Special inputs come after user arguments.
1970 1759 : allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
1971 : }
1972 :
1973 : // Start adding system SGPRs.
1974 19709 : if (IsEntryFunc) {
1975 17950 : allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
1976 : } else {
1977 1759 : CCInfo.AllocateReg(Info->getScratchRSrcReg());
1978 1759 : CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
1979 1759 : CCInfo.AllocateReg(Info->getFrameOffsetReg());
1980 1759 : allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
1981 : }
1982 :
1983 : auto &ArgUsageInfo =
1984 19709 : DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
1985 19709 : ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
1986 :
1987 19709 : unsigned StackArgSize = CCInfo.getNextStackOffset();
1988 : Info->setBytesInStackArgArea(StackArgSize);
1989 :
1990 19709 : return Chains.empty() ? Chain :
1991 19709 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
1992 : }
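     : // Recap of the three argument paths handled above: kernel arguments become
     : // invariant loads from the kernarg segment, non-entry stack arguments become
     : // fixed-object loads, and register arguments are copied out of live-ins with
     : // an optional assert/truncate to recover the original narrow type.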
1993 :
1994 : // TODO: If return values can't fit in registers, we should return as many as
1995 : // possible in registers before passing on stack.
1996 20295 : bool SITargetLowering::CanLowerReturn(
1997 : CallingConv::ID CallConv,
1998 : MachineFunction &MF, bool IsVarArg,
1999 : const SmallVectorImpl<ISD::OutputArg> &Outs,
2000 : LLVMContext &Context) const {
2001 : // Replacing returns with sret/stack usage doesn't make sense for shaders.
2002 : // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2003 : // for shaders. Vector types should be explicitly handled by CC.
2004 20295 : if (AMDGPU::isEntryFunctionCC(CallConv))
2005 : return true;
2006 :
2007 : SmallVector<CCValAssign, 16> RVLocs;
2008 4684 : CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2009 2342 : return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2010 : }
2011 :
2012 : SDValue
2013 19637 : SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2014 : bool isVarArg,
2015 : const SmallVectorImpl<ISD::OutputArg> &Outs,
2016 : const SmallVectorImpl<SDValue> &OutVals,
2017 : const SDLoc &DL, SelectionDAG &DAG) const {
2018 19637 : MachineFunction &MF = DAG.getMachineFunction();
2019 19637 : SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2020 :
2021 : if (AMDGPU::isKernel(CallConv)) {
2022 : return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2023 16187 : OutVals, DL, DAG);
2024 : }
2025 :
2026 3450 : bool IsShader = AMDGPU::isShader(CallConv);
2027 :
2028 3450 : Info->setIfReturnsVoid(Outs.empty());
2029 3450 : bool IsWaveEnd = Info->returnsVoid() && IsShader;
2030 :
2031 : // CCValAssign - represent the assignment of the return value to a location.
2032 : SmallVector<CCValAssign, 48> RVLocs;
2033 : SmallVector<ISD::OutputArg, 48> Splits;
2034 :
2035 : // CCState - Info about the registers and stack slots.
2036 : CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2037 6900 : *DAG.getContext());
2038 :
2039 : // Analyze outgoing return values.
2040 3450 : CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2041 :
2042 3450 : SDValue Flag;
2043 : SmallVector<SDValue, 48> RetOps;
2044 3450 : RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2045 :
2046 : // Add return address for callable functions.
2047 3450 : if (!Info->isEntryFunction()) {
2048 1713 : const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2049 : SDValue ReturnAddrReg = CreateLiveInRegister(
2050 3426 : DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2051 :
2052 : // FIXME: Should be able to use a vreg here, but need a way to prevent it
2053 : // from being allcoated to a CSR.
2054 : // from being allocated to a CSR.
2055 : SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2056 1713 : MVT::i64);
2057 :
2058 1713 : Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2059 1713 : Flag = Chain.getValue(1);
2060 :
2061 1713 : RetOps.push_back(PhysReturnAddrReg);
2062 : }
2063 :
2064 : // Copy the result values into the output registers.
2065 9020 : for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2066 : ++I, ++RealRVLocIdx) {
2067 5570 : CCValAssign &VA = RVLocs[I];
2068 : assert(VA.isRegLoc() && "Can only return in registers!");
2069 : // TODO: Partially return in registers if return values don't fit.
2070 5570 : SDValue Arg = OutVals[RealRVLocIdx];
2071 :
2072 : // Copied from other backends.
2073 5570 : switch (VA.getLocInfo()) {
2074 : case CCValAssign::Full:
2075 : break;
2076 : case CCValAssign::BCvt:
2077 0 : Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2078 0 : break;
2079 : case CCValAssign::SExt:
2080 0 : Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2081 0 : break;
2082 : case CCValAssign::ZExt:
2083 0 : Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2084 0 : break;
2085 : case CCValAssign::AExt:
2086 6 : Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2087 6 : break;
2088 0 : default:
2089 0 : llvm_unreachable("Unknown loc info!");
2090 : }
2091 :
2092 5570 : Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2093 5570 : Flag = Chain.getValue(1);
2094 5570 : RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2095 : }
2096 :
2097 : // FIXME: Does sret work properly?
2098 3450 : if (!Info->isEntryFunction()) {
2099 1713 : const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2100 : const MCPhysReg *I =
2101 1713 : TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2102 1713 : if (I) {
2103 0 : for (; *I; ++I) {
2104 0 : if (AMDGPU::SReg_64RegClass.contains(*I))
2105 0 : RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2106 0 : else if (AMDGPU::SReg_32RegClass.contains(*I))
2107 0 : RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2108 : else
2109 0 : llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2110 : }
2111 : }
2112 : }
2113 :
2114 : // Update chain and glue.
2115 3450 : RetOps[0] = Chain;
2116 3450 : if (Flag.getNode())
2117 2828 : RetOps.push_back(Flag);
2118 :
2119 : unsigned Opc = AMDGPUISD::ENDPGM;
2120 3450 : if (!IsWaveEnd)
2121 2828 : Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2122 3450 : return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2123 : }
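     : // Terminator selection above: ENDPGM when a shader returns void (the wave
     : // simply ends), RETURN_TO_EPILOG for shaders that produce outputs, and
     : // RET_FLAG for ordinary callable functions returning through the epilogue.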
2124 :
2125 533 : SDValue SITargetLowering::LowerCallResult(
2126 : SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2127 : const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2128 : SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2129 : SDValue ThisVal) const {
2130 533 : CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2131 :
2132 : // Assign locations to each value returned by this call.
2133 : SmallVector<CCValAssign, 16> RVLocs;
2134 : CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2135 533 : *DAG.getContext());
2136 533 : CCInfo.AnalyzeCallResult(Ins, RetCC);
2137 :
2138 : // Copy all of the result registers out of their specified physreg.
2139 872 : for (unsigned i = 0; i != RVLocs.size(); ++i) {
2140 339 : CCValAssign VA = RVLocs[i];
2141 339 : SDValue Val;
2142 :
2143 339 : if (VA.isRegLoc()) {
2144 339 : Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2145 339 : Chain = Val.getValue(1);
2146 339 : InFlag = Val.getValue(2);
2147 : } else if (VA.isMemLoc()) {
2148 0 : report_fatal_error("TODO: return values in memory");
2149 : } else
2150 : llvm_unreachable("unknown argument location type");
2151 :
2152 339 : switch (VA.getLocInfo()) {
2153 : case CCValAssign::Full:
2154 : break;
2155 : case CCValAssign::BCvt:
2156 0 : Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2157 0 : break;
2158 : case CCValAssign::ZExt:
2159 7 : Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2160 7 : DAG.getValueType(VA.getValVT()));
2161 7 : Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2162 7 : break;
2163 : case CCValAssign::SExt:
2164 7 : Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2165 7 : DAG.getValueType(VA.getValVT()));
2166 7 : Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2167 7 : break;
2168 : case CCValAssign::AExt:
2169 3 : Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2170 3 : break;
2171 0 : default:
2172 0 : llvm_unreachable("Unknown loc info!");
2173 : }
2174 :
2175 339 : InVals.push_back(Val);
2176 : }
2177 :
2178 533 : return Chain;
2179 : }
2180 :
2181 : // Add code to pass special inputs required depending on used features separate
2182 : // from the explicit user arguments present in the IR.
2183 575 : void SITargetLowering::passSpecialInputs(
2184 : CallLoweringInfo &CLI,
2185 : CCState &CCInfo,
2186 : const SIMachineFunctionInfo &Info,
2187 : SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2188 : SmallVectorImpl<SDValue> &MemOpChains,
2189 : SDValue Chain) const {
2190 : // If we don't have a call site, this was a call inserted by
2191 : // legalization. These can never use special inputs.
2192 575 : if (!CLI.CS)
2193 0 : return;
2194 :
2195 : const Function *CalleeFunc = CLI.CS.getCalledFunction();
2196 : assert(CalleeFunc);
2197 :
2198 575 : SelectionDAG &DAG = CLI.DAG;
2199 575 : const SDLoc &DL = CLI.DL;
2200 :
2201 575 : const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2202 :
2203 : auto &ArgUsageInfo =
2204 575 : DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2205 : const AMDGPUFunctionArgInfo &CalleeArgInfo
2206 : = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2207 :
2208 : const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2209 :
2210 : // TODO: Unify with private memory register handling. This is complicated by
2211 : // the fact that at least in kernels, the input argument is not necessarily
2212 : // in the same location as the input.
2213 575 : AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
2214 : AMDGPUFunctionArgInfo::DISPATCH_PTR,
2215 : AMDGPUFunctionArgInfo::QUEUE_PTR,
2216 : AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
2217 : AMDGPUFunctionArgInfo::DISPATCH_ID,
2218 : AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
2219 : AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
2220 : AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
2221 : AMDGPUFunctionArgInfo::WORKITEM_ID_X,
2222 : AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
2223 : AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
2224 : AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
2225 : };
2226 :
2227 6900 : for (auto InputID : InputRegs) {
2228 : const ArgDescriptor *OutgoingArg;
2229 : const TargetRegisterClass *ArgRC;
2230 :
2231 6325 : std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2232 6325 : if (!OutgoingArg)
2233 6204 : continue;
2234 :
2235 : const ArgDescriptor *IncomingArg;
2236 : const TargetRegisterClass *IncomingArgRC;
2237 : std::tie(IncomingArg, IncomingArgRC)
2238 121 : = CallerArgInfo.getPreloadedValue(InputID);
2239 : assert(IncomingArgRC == ArgRC);
2240 :
2241 : // All special arguments are ints for now.
2242 121 : EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2243 121 : SDValue InputReg;
2244 :
2245 121 : if (IncomingArg) {
2246 111 : InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2247 : } else {
2248 : // The implicit arg ptr is special because it doesn't have a corresponding
2249 : // input for kernels, and is computed from the kernarg segment pointer.
2250 : assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2251 10 : InputReg = getImplicitArgPtr(DAG, DL);
2252 : }
2253 :
2254 121 : if (OutgoingArg->isRegister()) {
2255 111 : RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2256 : } else {
2257 10 : unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2258 : SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2259 10 : SpecialArgOffset);
2260 10 : MemOpChains.push_back(ArgStore);
2261 : }
2262 : }
2263 : }
2264 :
2265 : static bool canGuaranteeTCO(CallingConv::ID CC) {
2266 39 : return CC == CallingConv::Fast;
2267 : }
2268 :
2269 : /// Return true if we might ever do TCO for calls with this calling convention.
2270 : static bool mayTailCallThisCC(CallingConv::ID CC) {
2271 51 : switch (CC) {
2272 : case CallingConv::C:
2273 : return true;
2274 : default:
2275 : return canGuaranteeTCO(CC);
2276 : }
2277 : }
2278 :
2279 51 : bool SITargetLowering::isEligibleForTailCallOptimization(
2280 : SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2281 : const SmallVectorImpl<ISD::OutputArg> &Outs,
2282 : const SmallVectorImpl<SDValue> &OutVals,
2283 : const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2284 39 : if (!mayTailCallThisCC(CalleeCC))
2285 : return false;
2286 :
2287 51 : MachineFunction &MF = DAG.getMachineFunction();
2288 51 : const Function &CallerF = MF.getFunction();
2289 : CallingConv::ID CallerCC = CallerF.getCallingConv();
2290 51 : const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2291 51 : const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2292 :
2293 : // Kernels aren't callable, and don't have a live-in return address, so it
2294 : // doesn't make sense to do a tail call with entry functions.
2295 51 : if (!CallerPreserved)
2296 : return false;
2297 :
2298 : bool CCMatch = CallerCC == CalleeCC;
2299 :
2300 48 : if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2301 0 : if (canGuaranteeTCO(CalleeCC) && CCMatch)
2302 0 : return true;
2303 : return false;
2304 : }
2305 :
2306 : // TODO: Can we handle var args?
2307 48 : if (IsVarArg)
2308 : return false;
2309 :
2310 160 : for (const Argument &Arg : CallerF.args()) {
2311 115 : if (Arg.hasByValAttr())
2312 : return false;
2313 : }
2314 :
2315 45 : LLVMContext &Ctx = *DAG.getContext();
2316 :
2317 : // Check that the call results are passed in the same way.
2318 45 : if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2319 : CCAssignFnForCall(CalleeCC, IsVarArg),
2320 : CCAssignFnForCall(CallerCC, IsVarArg)))
2321 : return false;
2322 :
2323 : // The callee has to preserve all registers the caller needs to preserve.
2324 45 : if (!CCMatch) {
2325 0 : const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2326 0 : if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2327 : return false;
2328 : }
2329 :
2330 : // Nothing more to check if the callee is taking no arguments.
2331 45 : if (Outs.empty())
2332 : return true;
2333 :
2334 : SmallVector<CCValAssign, 16> ArgLocs;
2335 82 : CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2336 :
2337 41 : CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2338 :
2339 41 : const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2340 : // If the stack arguments for this call do not fit into our own save area then
2341 : // the call cannot be made a tail call.
2342 : // TODO: Is this really necessary?
2343 41 : if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2344 : return false;
2345 :
2346 38 : const MachineRegisterInfo &MRI = MF.getRegInfo();
2347 38 : return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2348 : }
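     : // Eligibility in short: a compatible calling convention, no varargs, no byval
     : // arguments in the caller, callee-preserved registers that cover everything
     : // the caller needs preserved, and outgoing stack arguments that fit inside the
     : // caller's own incoming argument area.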
2349 :
2350 28 : bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2351 28 : if (!CI->isTailCall())
2352 : return false;
2353 :
2354 4 : const Function *ParentFn = CI->getParent()->getParent();
2355 4 : if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2356 : return false;
2357 :
2358 1 : auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2359 1 : return (Attr.getValueAsString() != "true");
2360 : }
2361 :
2362 : // The wave scratch offset register is used as the global base pointer.
2363 583 : SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2364 : SmallVectorImpl<SDValue> &InVals) const {
2365 583 : SelectionDAG &DAG = CLI.DAG;
2366 583 : const SDLoc &DL = CLI.DL;
2367 : SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2368 : SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2369 : SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2370 583 : SDValue Chain = CLI.Chain;
2371 583 : SDValue Callee = CLI.Callee;
2372 : bool &IsTailCall = CLI.IsTailCall;
2373 583 : CallingConv::ID CallConv = CLI.CallConv;
2374 583 : bool IsVarArg = CLI.IsVarArg;
2375 : bool IsSibCall = false;
2376 : bool IsThisReturn = false;
2377 583 : MachineFunction &MF = DAG.getMachineFunction();
2378 :
2379 583 : if (IsVarArg) {
2380 : return lowerUnhandledCall(CLI, InVals,
2381 2 : "unsupported call to variadic function ");
2382 : }
2383 :
2384 582 : if (!CLI.CS.getInstruction())
2385 1 : report_fatal_error("unsupported libcall legalization");
2386 :
2387 : if (!CLI.CS.getCalledFunction()) {
2388 : return lowerUnhandledCall(CLI, InVals,
2389 8 : "unsupported indirect call to function ");
2390 : }
2391 :
2392 577 : if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2393 : return lowerUnhandledCall(CLI, InVals,
2394 2 : "unsupported required tail call to function ");
2395 : }
2396 :
2397 1152 : if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
2398 : // Note the issue is with the CC of the calling function, not of the call
2399 : // itself.
2400 : return lowerUnhandledCall(CLI, InVals,
2401 2 : "unsupported call from graphics shader of function ");
2402 : }
2403 :
2404 : // The first 4 bytes are reserved for the callee's emergency stack slot.
2405 575 : if (IsTailCall) {
2406 51 : IsTailCall = isEligibleForTailCallOptimization(
2407 : Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2408 51 : if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2409 0 : report_fatal_error("failed to perform tail call elimination on a call "
2410 : "site marked musttail");
2411 : }
2412 :
2413 51 : bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2414 :
2415 : // A sibling call is one where we're under the usual C ABI and not planning
2416 : // to change that but can still do a tail call:
2417 51 : if (!TailCallOpt && IsTailCall)
2418 : IsSibCall = true;
2419 :
2420 : if (IsTailCall)
2421 : ++NumTailCalls;
2422 : }
2423 :
2424 575 : const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2425 :
2426 : // Analyze operands of the call, assigning locations to each operand.
2427 : SmallVector<CCValAssign, 16> ArgLocs;
2428 1150 : CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2429 575 : CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2430 :
2431 : // The first 4 bytes are reserved for the callee's emergency stack slot.
2432 575 : CCInfo.AllocateStack(4, 4);
2433 :
2434 575 : CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2435 :
2436 : // Get a count of how many bytes are to be pushed on the stack.
2437 575 : unsigned NumBytes = CCInfo.getNextStackOffset();
2438 :
2439 575 : if (IsSibCall) {
2440 : // Since we're not changing the ABI to make this a tail call, the memory
2441 : // operands are already available in the caller's incoming argument space.
2442 : NumBytes = 0;
2443 : }
2444 :
2445 : // FPDiff is the byte offset of the call's argument area from the callee's.
2446 : // Stores to callee stack arguments will be placed in FixedStackSlots offset
2447 : // by this amount for a tail call. In a sibling call it must be 0 because the
2448 : // caller will deallocate the entire stack and the callee still expects its
2449 : // arguments to begin at SP+0. Completely unused for non-tail calls.
2450 : int32_t FPDiff = 0;
2451 575 : MachineFrameInfo &MFI = MF.getFrameInfo();
2452 : SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2453 :
2454 575 : SDValue CallerSavedFP;
2455 :
2456 : // Adjust the stack pointer for the new arguments...
2457 : // These operations are automatically eliminated by the prolog/epilog pass
2458 575 : if (!IsSibCall) {
2459 533 : Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2460 :
2461 533 : unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2462 :
2463 : // In the HSA case, this should be an identity copy.
2464 : SDValue ScratchRSrcReg
2465 533 : = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2466 533 : RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2467 :
2468 : // TODO: Don't hardcode these registers; get them from the callee function.
2469 : SDValue ScratchWaveOffsetReg
2470 533 : = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2471 533 : RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2472 :
2473 533 : if (!Info->isEntryFunction()) {
2474 : // Avoid clobbering this function's FP value. In the current convention
2475 : // the callee will overwrite it, so do a save/restore around the call site.
2476 104 : CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2477 104 : Info->getFrameOffsetReg(), MVT::i32);
2478 : }
2479 : }
2480 :
2481 : SmallVector<SDValue, 8> MemOpChains;
2482 : MVT PtrVT = MVT::i32;
2483 :
2484 : // Walk the register/memloc assignments, inserting copies/loads.
2485 2648 : for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2486 : ++i, ++realArgIdx) {
2487 2073 : CCValAssign &VA = ArgLocs[i];
2488 2073 : SDValue Arg = OutVals[realArgIdx];
2489 :
2490 : // Promote the value if needed.
2491 2073 : switch (VA.getLocInfo()) {
2492 : case CCValAssign::Full:
2493 : break;
2494 : case CCValAssign::BCvt:
2495 0 : Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2496 0 : break;
2497 : case CCValAssign::ZExt:
2498 10 : Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2499 10 : break;
2500 : case CCValAssign::SExt:
2501 10 : Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2502 10 : break;
2503 : case CCValAssign::AExt:
2504 4 : Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2505 4 : break;
2506 : case CCValAssign::FPExt:
2507 0 : Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2508 0 : break;
2509 0 : default:
2510 0 : llvm_unreachable("Unknown loc info!");
2511 : }
2512 :
2513 2073 : if (VA.isRegLoc()) {
2514 3982 : RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2515 : } else {
2516 : assert(VA.isMemLoc());
2517 :
2518 82 : SDValue DstAddr;
2519 : MachinePointerInfo DstInfo;
2520 :
2521 82 : unsigned LocMemOffset = VA.getLocMemOffset();
2522 82 : int32_t Offset = LocMemOffset;
2523 :
2524 82 : SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
2525 : unsigned Align = 0;
2526 :
2527 82 : if (IsTailCall) {
2528 35 : ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2529 35 : unsigned OpSize = Flags.isByVal() ?
2530 35 : Flags.getByValSize() : VA.getValVT().getStoreSize();
2531 :
2532 : // FIXME: We can have better than the minimum byval required alignment.
2533 35 : Align = Flags.isByVal() ? Flags.getByValAlign() :
2534 : MinAlign(Subtarget->getStackAlignment(), Offset);
2535 :
2536 : Offset = Offset + FPDiff;
2537 35 : int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2538 :
2539 35 : DstAddr = DAG.getFrameIndex(FI, PtrVT);
2540 35 : DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2541 :
2542 : // Make sure any stack arguments overlapping with where we're storing
2543 : // are loaded before this eventual operation. Otherwise they'll be
2544 : // clobbered.
2545 :
2546 : // FIXME: Why is this really necessary? This seems to just result in a
2547 : // lot of code to copy the stack and write them back to the same
2548 : // locations, which are supposed to be immutable?
2549 35 : Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2550 : } else {
2551 47 : DstAddr = PtrOff;
2552 47 : DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2553 47 : Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
2554 : }
2555 :
2556 82 : if (Outs[i].Flags.isByVal()) {
2557 : SDValue SizeNode =
2558 40 : DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2559 : SDValue Cpy = DAG.getMemcpy(
2560 : Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2561 : /*isVol = */ false, /*AlwaysInline = */ true,
2562 : /*isTailCall = */ false, DstInfo,
2563 40 : MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
2564 80 : *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
2565 :
2566 40 : MemOpChains.push_back(Cpy);
2567 : } else {
2568 42 : SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
2569 42 : MemOpChains.push_back(Store);
2570 : }
2571 : }
2572 : }
2573 :
2574 : // Copy special input registers after user input arguments.
2575 575 : passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
2576 :
2577 575 : if (!MemOpChains.empty())
2578 62 : Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2579 :
2580 : // Build a sequence of copy-to-reg nodes chained together with token chain
2581 : // and flag operands which copy the outgoing args into the appropriate regs.
2582 575 : SDValue InFlag;
2583 3743 : for (auto &RegToPass : RegsToPass) {
2584 3168 : Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2585 3168 : RegToPass.second, InFlag);
2586 3168 : InFlag = Chain.getValue(1);
2587 : }
2588 :
2589 :
2590 575 : SDValue PhysReturnAddrReg;
2591 575 : if (IsTailCall) {
2592 : // Since the return is being combined with the call, we need to pass on the
2593 : // return address.
2594 :
2595 42 : const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2596 : SDValue ReturnAddrReg = CreateLiveInRegister(
2597 84 : DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2598 :
2599 42 : PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2600 42 : MVT::i64);
2601 42 : Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2602 42 : InFlag = Chain.getValue(1);
2603 : }
2604 :
2605 : // We don't usually want to end the call-sequence here because we would tidy
2606 : // the frame up *after* the call; however, in the ABI-changing tail-call case
2607 : // we've carefully laid out the parameters so that when sp is reset they'll be
2608 : // in the correct location.
2609 575 : if (IsTailCall && !IsSibCall) {
2610 0 : Chain = DAG.getCALLSEQ_END(Chain,
2611 : DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2612 : DAG.getTargetConstant(0, DL, MVT::i32),
2613 0 : InFlag, DL);
2614 0 : InFlag = Chain.getValue(1);
2615 : }
2616 :
2617 : std::vector<SDValue> Ops;
2618 575 : Ops.push_back(Chain);
2619 575 : Ops.push_back(Callee);
2620 :
2621 575 : if (IsTailCall) {
2622 : // Each tail call may have to adjust the stack by a different amount, so
2623 : // this information must travel along with the operation for eventual
2624 : // consumption by emitEpilogue.
2625 42 : Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2626 :
2627 42 : Ops.push_back(PhysReturnAddrReg);
2628 : }
2629 :
2630 : // Add argument registers to the end of the list so that they are known live
2631 : // into the call.
2632 3743 : for (auto &RegToPass : RegsToPass) {
2633 3168 : Ops.push_back(DAG.getRegister(RegToPass.first,
2634 6336 : RegToPass.second.getValueType()));
2635 : }
2636 :
2637 : // Add a register mask operand representing the call-preserved registers.
2638 :
2639 575 : auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
2640 575 : const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2641 : assert(Mask && "Missing call preserved mask for calling convention");
2642 575 : Ops.push_back(DAG.getRegisterMask(Mask));
2643 :
2644 575 : if (InFlag.getNode())
2645 575 : Ops.push_back(InFlag);
2646 :
2647 575 : SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2648 :
2649 : // If we're doing a tall call, use a TC_RETURN here rather than an
2650 : // If we're doing a tail call, use a TC_RETURN here rather than an
2651 575 : if (IsTailCall) {
2652 : MFI.setHasTailCall();
2653 42 : return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2654 : }
2655 :
2656 : // Returns a chain and a flag for retval copy to use.
2657 533 : SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2658 533 : Chain = Call.getValue(0);
2659 533 : InFlag = Call.getValue(1);
2660 :
2661 533 : if (CallerSavedFP) {
2662 104 : SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2663 104 : Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2664 104 : InFlag = Chain.getValue(1);
2665 : }
2666 :
2667 533 : uint64_t CalleePopBytes = NumBytes;
2668 533 : Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2669 : DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2670 533 : InFlag, DL);
2671 533 : if (!Ins.empty())
2672 117 : InFlag = Chain.getValue(1);
2673 :
2674 : // Handle result values, copying them out of physregs into vregs that we
2675 : // return.
2676 : return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2677 : InVals, IsThisReturn,
2678 533 : IsThisReturn ? OutVals[0] : SDValue());
2679 : }
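     : // Notes on the sequence above: the scratch resource descriptor and wave offset
     : // travel in fixed SGPRs (SGPR0-SGPR3 and SGPR4 here), the caller's frame offset
     : // register is saved and restored around non-tail calls, and tail calls are
     : // emitted as TC_RETURN so the epilogue performs the actual jump.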
2680 :
2681 131 : unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2682 : SelectionDAG &DAG) const {
2683 0 : unsigned Reg = StringSwitch<unsigned>(RegName)
2684 131 : .Case("m0", AMDGPU::M0)
2685 131 : .Case("exec", AMDGPU::EXEC)
2686 131 : .Case("exec_lo", AMDGPU::EXEC_LO)
2687 131 : .Case("exec_hi", AMDGPU::EXEC_HI)
2688 131 : .Case("flat_scratch", AMDGPU::FLAT_SCR)
2689 131 : .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2690 131 : .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2691 : .Default(AMDGPU::NoRegister);
2692 :
2693 131 : if (Reg == AMDGPU::NoRegister) {
2694 0 : report_fatal_error(Twine("invalid register name \""
2695 : + StringRef(RegName) + "\"."));
2696 :
2697 : }
2698 :
2699 162 : if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2700 31 : Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2701 1 : report_fatal_error(Twine("invalid register \""
2702 : + StringRef(RegName) + "\" for subtarget."));
2703 : }
2704 :
2705 : switch (Reg) {
2706 17 : case AMDGPU::M0:
2707 : case AMDGPU::EXEC_LO:
2708 : case AMDGPU::EXEC_HI:
2709 : case AMDGPU::FLAT_SCR_LO:
2710 : case AMDGPU::FLAT_SCR_HI:
2711 17 : if (VT.getSizeInBits() == 32)
2712 : return Reg;
2713 : break;
2714 113 : case AMDGPU::EXEC:
2715 : case AMDGPU::FLAT_SCR:
2716 113 : if (VT.getSizeInBits() == 64)
2717 : return Reg;
2718 : break;
2719 0 : default:
2720 0 : llvm_unreachable("missing register type checking");
2721 : }
2722 :
2723 2 : report_fatal_error(Twine("invalid type for register \""
2724 : + StringRef(RegName) + "\"."));
2725 : }
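     : // Illustrative (not taken from this file): IR along the lines of
     : //   %v = call i64 @llvm.read_register.i64(metadata !0)   ; !0 = !{!"exec"}
     : // reaches this hook with RegName == "exec" and a 64-bit VT and yields
     : // AMDGPU::EXEC, while a 32-bit request for "exec" is rejected by the size
     : // check above.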
2726 :
2727 : // If kill is not the last instruction, split the block so kill is always a
2728 : // proper terminator.
2729 84 : MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2730 : MachineBasicBlock *BB) const {
2731 84 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2732 :
2733 : MachineBasicBlock::iterator SplitPoint(&MI);
2734 : ++SplitPoint;
2735 :
2736 84 : if (SplitPoint == BB->end()) {
2737 : // Don't bother with a new block.
2738 8 : MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2739 4 : return BB;
2740 : }
2741 :
2742 80 : MachineFunction *MF = BB->getParent();
2743 : MachineBasicBlock *SplitBB
2744 80 : = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2745 :
2746 : MF->insert(++MachineFunction::iterator(BB), SplitBB);
2747 : SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2748 :
2749 80 : SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2750 80 : BB->addSuccessor(SplitBB);
2751 :
2752 160 : MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2753 80 : return SplitBB;
2754 : }
2755 :
2756 : // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2757 : // wavefront. If the value is uniform and just happens to be in a VGPR, this
2758 : // will only do one iteration. In the worst case, this will loop 64 times.
2759 : //
2760 : // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
2761 32 : static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2762 : const SIInstrInfo *TII,
2763 : MachineRegisterInfo &MRI,
2764 : MachineBasicBlock &OrigBB,
2765 : MachineBasicBlock &LoopBB,
2766 : const DebugLoc &DL,
2767 : const MachineOperand &IdxReg,
2768 : unsigned InitReg,
2769 : unsigned ResultReg,
2770 : unsigned PhiReg,
2771 : unsigned InitSaveExecReg,
2772 : int Offset,
2773 : bool UseGPRIdxMode,
2774 : bool IsIndirectSrc) {
2775 32 : MachineBasicBlock::iterator I = LoopBB.begin();
2776 :
2777 32 : unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2778 32 : unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2779 32 : unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2780 32 : unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2781 :
2782 32 : BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2783 32 : .addReg(InitReg)
2784 : .addMBB(&OrigBB)
2785 32 : .addReg(ResultReg)
2786 : .addMBB(&LoopBB);
2787 :
2788 32 : BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2789 32 : .addReg(InitSaveExecReg)
2790 : .addMBB(&OrigBB)
2791 32 : .addReg(NewExec)
2792 : .addMBB(&LoopBB);
2793 :
2794 : // Read the next variant <- also loop target.
2795 64 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2796 32 : .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2797 :
2798 : // Compare the just read M0 value to all possible Idx values.
2799 64 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2800 32 : .addReg(CurrentIdxReg)
2801 32 : .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2802 :
2803 : // Update EXEC, save the original EXEC value to VCC.
2804 96 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2805 32 : .addReg(CondReg, RegState::Kill);
2806 :
2807 : MRI.setSimpleHint(NewExec, CondReg);
2808 :
2809 32 : if (UseGPRIdxMode) {
2810 : unsigned IdxReg;
2811 16 : if (Offset == 0) {
2812 : IdxReg = CurrentIdxReg;
2813 : } else {
2814 6 : IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2815 18 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2816 6 : .addReg(CurrentIdxReg, RegState::Kill)
2817 6 : .addImm(Offset);
2818 : }
2819 16 : unsigned IdxMode = IsIndirectSrc ?
2820 : VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
2821 : MachineInstr *SetOn =
2822 32 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2823 16 : .addReg(IdxReg, RegState::Kill)
2824 16 : .addImm(IdxMode);
2825 16 : SetOn->getOperand(3).setIsUndef();
2826 : } else {
2827 : // Move index from VCC into M0
2828 16 : if (Offset == 0) {
2829 30 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2830 10 : .addReg(CurrentIdxReg, RegState::Kill);
2831 : } else {
2832 18 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2833 6 : .addReg(CurrentIdxReg, RegState::Kill)
2834 6 : .addImm(Offset);
2835 : }
2836 : }
2837 :
2838 : // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2839 : MachineInstr *InsertPt =
2840 64 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
2841 32 : .addReg(AMDGPU::EXEC)
2842 32 : .addReg(NewExec);
2843 :
2844 : // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2845 : // s_cbranch_scc0?
2846 :
2847 : // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2848 64 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2849 : .addMBB(&LoopBB);
2850 :
2851 32 : return InsertPt->getIterator();
2852 : }
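
A rough scalar model of the waterfall loop above may help; it is illustrative only and not part of this file, and it models a 64-lane wavefront with the EXEC mask as a plain 64-bit integer.

  #include <array>
  #include <cstdint>

  // Each iteration retires every lane that shares one index value, so the loop
  // runs once per unique index: one pass for a uniform index, 64 in the worst case.
  static void waterfallModel(const std::array<uint32_t, 64> &IdxPerLane,
                             uint64_t Exec) {
    while (Exec != 0) {
      // v_readfirstlane_b32: take the index from the first still-active lane.
      uint32_t Idx = IdxPerLane[__builtin_ctzll(Exec)];

      // v_cmp_eq_u32 + s_and_saveexec_b64: collect the active lanes with that index.
      uint64_t Same = 0;
      for (unsigned L = 0; L != 64; ++L)
        if (((Exec >> L) & 1) && IdxPerLane[L] == Idx)
          Same |= 1ull << L;

      // ... the indexed access runs here with M0 (or the GPR index mode) set to
      // Idx, active only for the lanes in Same ...

      // s_xor_b64 exec, exec, Same: retire those lanes; s_cbranch_execnz loops back.
      Exec &= ~Same;
    }
  }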
2853 :
2854 : // This has slightly sub-optimal regalloc when the source vector is killed by
2855 : // the read. The register allocator does not understand that the kill is
2856 : // per-workitem, so the source is kept alive for the whole loop and we end up
2857 : // not reusing a subregister from it, using one more VGPR than necessary. That
2858 : // VGPR was saved back when this was expanded after register allocation.
2859 32 : static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
2860 : MachineBasicBlock &MBB,
2861 : MachineInstr &MI,
2862 : unsigned InitResultReg,
2863 : unsigned PhiReg,
2864 : int Offset,
2865 : bool UseGPRIdxMode,
2866 : bool IsIndirectSrc) {
2867 32 : MachineFunction *MF = MBB.getParent();
2868 32 : MachineRegisterInfo &MRI = MF->getRegInfo();
2869 : const DebugLoc &DL = MI.getDebugLoc();
2870 : MachineBasicBlock::iterator I(&MI);
2871 :
2872 32 : unsigned DstReg = MI.getOperand(0).getReg();
2873 32 : unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2874 32 : unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2875 :
2876 64 : BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2877 :
2878 : // Save the EXEC mask
2879 64 : BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2880 32 : .addReg(AMDGPU::EXEC);
2881 :
2882 : // To insert the loop we need to split the block. Move everything after this
2883 : // point to a new block, and insert a new empty block between the two.
2884 32 : MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
2885 32 : MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2886 : MachineFunction::iterator MBBI(MBB);
2887 : ++MBBI;
2888 :
2889 : MF->insert(MBBI, LoopBB);
2890 : MF->insert(MBBI, RemainderBB);
2891 :
2892 32 : LoopBB->addSuccessor(LoopBB);
2893 32 : LoopBB->addSuccessor(RemainderBB);
2894 :
2895 : // Move the rest of the block into a new block.
2896 32 : RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
2897 : RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2898 :
2899 32 : MBB.addSuccessor(LoopBB);
2900 :
2901 32 : const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2902 :
2903 : auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2904 : InitResultReg, DstReg, PhiReg, TmpExec,
2905 32 : Offset, UseGPRIdxMode, IsIndirectSrc);
2906 :
2907 32 : MachineBasicBlock::iterator First = RemainderBB->begin();
2908 64 : BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
2909 32 : .addReg(SaveExec);
2910 :
2911 32 : return InsPt;
2912 : }
2913 :
2914 : // Returns subreg index, offset
2915 : static std::pair<unsigned, int>
2916 0 : computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
2917 : const TargetRegisterClass *SuperRC,
2918 : unsigned VecReg,
2919 : int Offset) {
2920 0 : int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
2921 :
2922 : // Skip out of bounds offsets, or else we would end up using an undefined
2923 : // register.
2924 0 : if (Offset >= NumElts || Offset < 0)
2925 0 : return std::make_pair(AMDGPU::sub0, Offset);
2926 :
2927 0 : return std::make_pair(AMDGPU::sub0 + Offset, 0);
2928 : }
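
As a quick worked example of the contract above (illustrative only, using plain ints in place of SIRegisterInfo and the subregister enums): an in-range constant offset is folded into the subregister index, while an out-of-range offset falls back to sub0 plus a runtime offset.

  #include <utility>
  static std::pair<int, int> sketchRegAndOffset(int NumElts, int Offset) {
    if (Offset >= NumElts || Offset < 0)
      return {0, Offset};   // sub0 plus a runtime offset (out of bounds)
    return {Offset, 0};     // sub0 + Offset becomes the subregister, offset folded away
  }
  // e.g. NumElts = 4: Offset 2 -> {2, 0}; Offset 7 -> {0, 7}.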
2929 :
2930 : // Return true if the index is an SGPR and was set.
2931 161 : static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
2932 : MachineRegisterInfo &MRI,
2933 : MachineInstr &MI,
2934 : int Offset,
2935 : bool UseGPRIdxMode,
2936 : bool IsIndirectSrc) {
2937 161 : MachineBasicBlock *MBB = MI.getParent();
2938 : const DebugLoc &DL = MI.getDebugLoc();
2939 : MachineBasicBlock::iterator I(&MI);
2940 :
2941 161 : const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2942 161 : const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
2943 :
2944 : assert(Idx->getReg() != AMDGPU::NoRegister);
2945 :
2946 161 : if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
2947 : return false;
2948 :
2949 129 : if (UseGPRIdxMode) {
2950 31 : unsigned IdxMode = IsIndirectSrc ?
2951 : VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
2952 31 : if (Offset == 0) {
2953 : MachineInstr *SetOn =
2954 34 : BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2955 : .add(*Idx)
2956 17 : .addImm(IdxMode);
2957 :
2958 17 : SetOn->getOperand(3).setIsUndef();
2959 : } else {
2960 14 : unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2961 28 : BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
2962 : .add(*Idx)
2963 14 : .addImm(Offset);
2964 : MachineInstr *SetOn =
2965 28 : BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2966 14 : .addReg(Tmp, RegState::Kill)
2967 14 : .addImm(IdxMode);
2968 :
2969 14 : SetOn->getOperand(3).setIsUndef();
2970 : }
2971 :
2972 31 : return true;
2973 : }
2974 :
2975 98 : if (Offset == 0) {
2976 252 : BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2977 : .add(*Idx);
2978 : } else {
2979 28 : BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2980 : .add(*Idx)
2981 14 : .addImm(Offset);
2982 : }
2983 :
2984 : return true;
2985 : }
2986 :
2987 : // Control flow needs to be inserted if indexing with a VGPR.
2988 71 : static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
2989 : MachineBasicBlock &MBB,
2990 : const GCNSubtarget &ST) {
2991 71 : const SIInstrInfo *TII = ST.getInstrInfo();
2992 : const SIRegisterInfo &TRI = TII->getRegisterInfo();
2993 71 : MachineFunction *MF = MBB.getParent();
2994 71 : MachineRegisterInfo &MRI = MF->getRegInfo();
2995 :
2996 71 : unsigned Dst = MI.getOperand(0).getReg();
2997 71 : unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
2998 71 : int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
2999 :
3000 : const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3001 :
3002 : unsigned SubReg;
3003 : std::tie(SubReg, Offset)
3004 71 : = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3005 :
3006 : bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3007 :
3008 71 : if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
3009 : MachineBasicBlock::iterator I(&MI);
3010 : const DebugLoc &DL = MI.getDebugLoc();
3011 :
3012 59 : if (UseGPRIdxMode) {
3013 : // TODO: Look at the uses to avoid the copy. This may require rescheduling
3014 : // to avoid interfering with other uses, so probably requires a new
3015 : // optimization pass.
3016 34 : BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3017 17 : .addReg(SrcReg, RegState::Undef, SubReg)
3018 17 : .addReg(SrcReg, RegState::Implicit)
3019 17 : .addReg(AMDGPU::M0, RegState::Implicit);
3020 34 : BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3021 : } else {
3022 126 : BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3023 42 : .addReg(SrcReg, RegState::Undef, SubReg)
3024 42 : .addReg(SrcReg, RegState::Implicit);
3025 : }
3026 :
3027 59 : MI.eraseFromParent();
3028 :
3029 : return &MBB;
3030 : }
3031 :
3032 : const DebugLoc &DL = MI.getDebugLoc();
3033 : MachineBasicBlock::iterator I(&MI);
3034 :
3035 12 : unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3036 12 : unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3037 :
3038 24 : BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3039 :
3040 : auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3041 12 : Offset, UseGPRIdxMode, true);
3042 12 : MachineBasicBlock *LoopBB = InsPt->getParent();
3043 :
3044 12 : if (UseGPRIdxMode) {
3045 12 : BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3046 6 : .addReg(SrcReg, RegState::Undef, SubReg)
3047 6 : .addReg(SrcReg, RegState::Implicit)
3048 6 : .addReg(AMDGPU::M0, RegState::Implicit);
3049 12 : BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3050 : } else {
3051 18 : BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3052 6 : .addReg(SrcReg, RegState::Undef, SubReg)
3053 6 : .addReg(SrcReg, RegState::Implicit);
3054 : }
3055 :
3056 12 : MI.eraseFromParent();
3057 :
3058 12 : return LoopBB;
3059 : }
3060 :
3061 66 : static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3062 : const TargetRegisterClass *VecRC) {
3063 66 : switch (TRI.getRegSizeInBits(*VecRC)) {
3064 : case 32: // 4 bytes
3065 : return AMDGPU::V_MOVRELD_B32_V1;
3066 6 : case 64: // 8 bytes
3067 6 : return AMDGPU::V_MOVRELD_B32_V2;
3068 38 : case 128: // 16 bytes
3069 38 : return AMDGPU::V_MOVRELD_B32_V4;
3070 16 : case 256: // 32 bytes
3071 16 : return AMDGPU::V_MOVRELD_B32_V8;
3072 6 : case 512: // 64 bytes
3073 6 : return AMDGPU::V_MOVRELD_B32_V16;
3074 0 : default:
3075 0 : llvm_unreachable("unsupported size for MOVRELD pseudos");
3076 : }
3077 : }
3078 :
3079 90 : static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3080 : MachineBasicBlock &MBB,
3081 : const GCNSubtarget &ST) {
3082 90 : const SIInstrInfo *TII = ST.getInstrInfo();
3083 : const SIRegisterInfo &TRI = TII->getRegisterInfo();
3084 90 : MachineFunction *MF = MBB.getParent();
3085 90 : MachineRegisterInfo &MRI = MF->getRegInfo();
3086 :
3087 90 : unsigned Dst = MI.getOperand(0).getReg();
3088 90 : const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3089 90 : const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3090 90 : const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3091 90 : int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3092 90 : const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3093 :
3094 : // This can be an immediate, but will be folded later.
3095 : assert(Val->getReg());
3096 :
3097 : unsigned SubReg;
3098 90 : std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3099 : SrcVec->getReg(),
3100 90 : Offset);
3101 : bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3102 :
3103 90 : if (Idx->getReg() == AMDGPU::NoRegister) {
3104 : MachineBasicBlock::iterator I(&MI);
3105 : const DebugLoc &DL = MI.getDebugLoc();
3106 :
3107 : assert(Offset == 0);
3108 :
3109 0 : BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3110 : .add(*SrcVec)
3111 : .add(*Val)
3112 0 : .addImm(SubReg);
3113 :
3114 0 : MI.eraseFromParent();
3115 : return &MBB;
3116 : }
3117 :
3118 90 : if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
3119 : MachineBasicBlock::iterator I(&MI);
3120 : const DebugLoc &DL = MI.getDebugLoc();
3121 :
3122 70 : if (UseGPRIdxMode) {
3123 28 : BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3124 14 : .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3125 : .add(*Val)
3126 14 : .addReg(Dst, RegState::ImplicitDefine)
3127 14 : .addReg(SrcVec->getReg(), RegState::Implicit)
3128 14 : .addReg(AMDGPU::M0, RegState::Implicit);
3129 :
3130 28 : BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3131 : } else {
3132 56 : const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3133 :
3134 112 : BuildMI(MBB, I, DL, MovRelDesc)
3135 56 : .addReg(Dst, RegState::Define)
3136 56 : .addReg(SrcVec->getReg())
3137 : .add(*Val)
3138 56 : .addImm(SubReg - AMDGPU::sub0);
3139 : }
3140 :
3141 70 : MI.eraseFromParent();
3142 : return &MBB;
3143 : }
3144 :
3145 20 : if (Val->isReg())
3146 20 : MRI.clearKillFlags(Val->getReg());
3147 :
3148 : const DebugLoc &DL = MI.getDebugLoc();
3149 :
3150 20 : unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3151 :
3152 : auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
3153 20 : Offset, UseGPRIdxMode, false);
3154 20 : MachineBasicBlock *LoopBB = InsPt->getParent();
3155 :
3156 20 : if (UseGPRIdxMode) {
3157 20 : BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3158 10 : .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3159 : .add(*Val) // src0
3160 10 : .addReg(Dst, RegState::ImplicitDefine)
3161 10 : .addReg(PhiReg, RegState::Implicit)
3162 10 : .addReg(AMDGPU::M0, RegState::Implicit);
3163 20 : BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3164 : } else {
3165 10 : const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3166 :
3167 20 : BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3168 10 : .addReg(Dst, RegState::Define)
3169 10 : .addReg(PhiReg)
3170 : .add(*Val)
3171 10 : .addImm(SubReg - AMDGPU::sub0);
3172 : }
3173 :
3174 20 : MI.eraseFromParent();
3175 :
3176 20 : return LoopBB;
3177 : }
3178 :
3179 14237 : MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3180 : MachineInstr &MI, MachineBasicBlock *BB) const {
3181 :
3182 14237 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3183 14237 : MachineFunction *MF = BB->getParent();
3184 14237 : SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3185 :
3186 14237 : if (TII->isMIMG(MI)) {
3187 737 : if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3188 0 : report_fatal_error("missing mem operand from MIMG instruction");
3189 : }
3190 : // Add a memoperand for mimg instructions so that they aren't assumed to
3191 : // be ordered memory instuctions.
3192 :     // be ordered memory instructions.
3193 737 : return BB;
3194 : }
3195 :
3196 13500 : switch (MI.getOpcode()) {
3197 2285 : case AMDGPU::S_ADD_U64_PSEUDO:
3198 : case AMDGPU::S_SUB_U64_PSEUDO: {
3199 2285 : MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3200 : const DebugLoc &DL = MI.getDebugLoc();
3201 :
3202 2285 : MachineOperand &Dest = MI.getOperand(0);
3203 : MachineOperand &Src0 = MI.getOperand(1);
3204 : MachineOperand &Src1 = MI.getOperand(2);
3205 :
3206 2285 : unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3207 2285 : unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3208 :
3209 : MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3210 : Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3211 2285 : &AMDGPU::SReg_32_XM0RegClass);
3212 : MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3213 : Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3214 2285 : &AMDGPU::SReg_32_XM0RegClass);
3215 :
3216 : MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3217 : Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3218 2285 : &AMDGPU::SReg_32_XM0RegClass);
3219 : MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3220 : Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3221 2285 : &AMDGPU::SReg_32_XM0RegClass);
3222 :
3223 2285 : bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3224 :
3225 2285 : unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3226 2285 : unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3227 4570 : BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3228 : .add(Src0Sub0)
3229 : .add(Src1Sub0);
3230 4570 : BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3231 : .add(Src0Sub1)
3232 : .add(Src1Sub1);
3233 4570 : BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3234 2285 : .addReg(DestSub0)
3235 : .addImm(AMDGPU::sub0)
3236 2285 : .addReg(DestSub1)
3237 : .addImm(AMDGPU::sub1);
3238 2285 : MI.eraseFromParent();
3239 : return BB;
3240 : }
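
A scalar sketch of what this expansion computes (illustrative only, not part of this file): the 64-bit scalar add or subtract is split into s_add_u32/s_sub_u32 on the sub0 halves, which produce the carry or borrow in SCC, and s_addc_u32/s_subb_u32 on the sub1 halves, which consume it.

  #include <cstdint>
  static uint64_t addU64ViaHalves(uint64_t A, uint64_t B) {
    uint32_t Lo = uint32_t(A) + uint32_t(B);                       // s_add_u32, carry-out in SCC
    uint32_t Carry = Lo < uint32_t(A);                             // SCC
    uint32_t Hi = uint32_t(A >> 32) + uint32_t(B >> 32) + Carry;   // s_addc_u32
    return (uint64_t(Hi) << 32) | Lo;                              // REG_SEQUENCE of sub0/sub1
  }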
3241 9134 : case AMDGPU::SI_INIT_M0: {
3242 9134 : BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3243 18268 : TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3244 9134 : .add(MI.getOperand(0));
3245 9134 : MI.eraseFromParent();
3246 9134 : return BB;
3247 : }
3248 3 : case AMDGPU::SI_INIT_EXEC:
3249 : // This should be before all vector instructions.
3250 : BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3251 3 : AMDGPU::EXEC)
3252 3 : .addImm(MI.getOperand(0).getImm());
3253 3 : MI.eraseFromParent();
3254 3 : return BB;
3255 :
3256 : case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3257 : // Extract the thread count from an SGPR input and set EXEC accordingly.
3258 : // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3259 : //
3260 : // S_BFE_U32 count, input, {shift, 7}
3261 : // S_BFM_B64 exec, count, 0
3262 : // S_CMP_EQ_U32 count, 64
3263 : // S_CMOV_B64 exec, -1
3264 : MachineInstr *FirstMI = &*BB->begin();
3265 4 : MachineRegisterInfo &MRI = MF->getRegInfo();
3266 4 : unsigned InputReg = MI.getOperand(0).getReg();
3267 4 : unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3268 : bool Found = false;
3269 :
3270 : // Move the COPY of the input reg to the beginning, so that we can use it.
3271 10 : for (auto I = BB->begin(); I != &MI; I++) {
3272 20 : if (I->getOpcode() != TargetOpcode::COPY ||
3273 10 : I->getOperand(0).getReg() != InputReg)
3274 : continue;
3275 :
3276 4 : if (I == FirstMI) {
3277 : FirstMI = &*++BB->begin();
3278 : } else {
3279 4 : I->removeFromParent();
3280 : BB->insert(FirstMI, &*I);
3281 : }
3282 : Found = true;
3283 : break;
3284 : }
3285 : assert(Found);
3286 : (void)Found;
3287 :
3288 : // This should be before all vector instructions.
3289 12 : BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3290 4 : .addReg(InputReg)
3291 4 : .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3292 8 : BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3293 4 : AMDGPU::EXEC)
3294 4 : .addReg(CountReg)
3295 : .addImm(0);
3296 12 : BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3297 4 : .addReg(CountReg, RegState::Kill)
3298 : .addImm(64);
3299 4 : BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3300 4 : AMDGPU::EXEC)
3301 : .addImm(-1);
3302 4 : MI.eraseFromParent();
3303 4 : return BB;
3304 : }
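
A scalar model of the sequence sketched in the comment above (illustrative only): s_bfm_b64 builds a mask of 'count' low bits, and the compare/cmov pair patches the count == 64 case, which the 6-bit BFM width operand cannot express.

  #include <cstdint>
  static uint64_t execFromThreadCount(unsigned Count) {
    unsigned W = Count & 63;                     // the BFM width operand is 6 bits
    uint64_t Exec = W ? ((1ull << W) - 1) : 0;   // s_bfm_b64 exec, count, 0
    if (Count == 64)                             // s_cmp_eq_u32 count, 64
      Exec = ~0ull;                              // s_cmov_b64 exec, -1
    return Exec;
  }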
3305 :
3306 : case AMDGPU::GET_GROUPSTATICSIZE: {
3307 : DebugLoc DL = MI.getDebugLoc();
3308 122 : BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3309 61 : .add(MI.getOperand(0))
3310 61 : .addImm(MFI->getLDSSize());
3311 61 : MI.eraseFromParent();
3312 : return BB;
3313 : }
3314 71 : case AMDGPU::SI_INDIRECT_SRC_V1:
3315 : case AMDGPU::SI_INDIRECT_SRC_V2:
3316 : case AMDGPU::SI_INDIRECT_SRC_V4:
3317 : case AMDGPU::SI_INDIRECT_SRC_V8:
3318 : case AMDGPU::SI_INDIRECT_SRC_V16:
3319 71 : return emitIndirectSrc(MI, *BB, *getSubtarget());
3320 90 : case AMDGPU::SI_INDIRECT_DST_V1:
3321 : case AMDGPU::SI_INDIRECT_DST_V2:
3322 : case AMDGPU::SI_INDIRECT_DST_V4:
3323 : case AMDGPU::SI_INDIRECT_DST_V8:
3324 : case AMDGPU::SI_INDIRECT_DST_V16:
3325 90 : return emitIndirectDst(MI, *BB, *getSubtarget());
3326 84 : case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3327 : case AMDGPU::SI_KILL_I1_PSEUDO:
3328 84 : return splitKillBlock(MI, BB);
3329 49 : case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3330 49 : MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3331 :
3332 49 : unsigned Dst = MI.getOperand(0).getReg();
3333 49 : unsigned Src0 = MI.getOperand(1).getReg();
3334 49 : unsigned Src1 = MI.getOperand(2).getReg();
3335 : const DebugLoc &DL = MI.getDebugLoc();
3336 49 : unsigned SrcCond = MI.getOperand(3).getReg();
3337 :
3338 49 : unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3339 49 : unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3340 49 : unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3341 :
3342 98 : BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3343 49 : .addReg(SrcCond);
3344 98 : BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3345 49 : .addReg(Src0, 0, AMDGPU::sub0)
3346 49 : .addReg(Src1, 0, AMDGPU::sub0)
3347 49 : .addReg(SrcCondCopy);
3348 98 : BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3349 49 : .addReg(Src0, 0, AMDGPU::sub1)
3350 49 : .addReg(Src1, 0, AMDGPU::sub1)
3351 49 : .addReg(SrcCondCopy);
3352 :
3353 98 : BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3354 49 : .addReg(DstLo)
3355 : .addImm(AMDGPU::sub0)
3356 49 : .addReg(DstHi)
3357 : .addImm(AMDGPU::sub1);
3358 49 : MI.eraseFromParent();
3359 49 : return BB;
3360 : }
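
A single-lane scalar view of the expansion above (a sketch only; on the real hardware the condition is a 64-bit lane mask applied per lane): the 64-bit select is done as two 32-bit v_cndmask selects on the sub0/sub1 halves and then reassembled with REG_SEQUENCE.

  #include <cstdint>
  static uint64_t cndmask64(uint64_t Src0, uint64_t Src1, bool Cond) {
    uint32_t Lo = Cond ? uint32_t(Src1) : uint32_t(Src0);              // v_cndmask_b32 on sub0
    uint32_t Hi = Cond ? uint32_t(Src1 >> 32) : uint32_t(Src0 >> 32);  // v_cndmask_b32 on sub1
    return (uint64_t(Hi) << 32) | Lo;                                  // REG_SEQUENCE
  }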
3361 78 : case AMDGPU::SI_BR_UNDEF: {
3362 78 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3363 : const DebugLoc &DL = MI.getDebugLoc();
3364 156 : MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3365 78 : .add(MI.getOperand(0));
3366 78 : Br->getOperand(1).setIsUndef(true); // read undef SCC
3367 78 : MI.eraseFromParent();
3368 78 : return BB;
3369 : }
3370 1066 : case AMDGPU::ADJCALLSTACKUP:
3371 : case AMDGPU::ADJCALLSTACKDOWN: {
3372 1066 : const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3373 : MachineInstrBuilder MIB(*MF, &MI);
3374 :
3375 : // Add an implicit use of the frame offset reg to prevent the restore copy
3376 : // inserted after the call from being reorderd after stack operations in the
3377 :     // inserted after the call from being reordered after stack operations in
3378 :     // the caller's frame.
3379 1066 : .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3380 1066 : .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
3381 : return BB;
3382 : }
3383 575 : case AMDGPU::SI_CALL_ISEL:
3384 : case AMDGPU::SI_TCRETURN_ISEL: {
3385 575 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3386 : const DebugLoc &DL = MI.getDebugLoc();
3387 575 : unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3388 :
3389 575 : MachineRegisterInfo &MRI = MF->getRegInfo();
3390 575 : unsigned GlobalAddrReg = MI.getOperand(0).getReg();
3391 575 : MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
3392 : assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
3393 :
3394 575 : const GlobalValue *G = PCRel->getOperand(1).getGlobal();
3395 :
3396 : MachineInstrBuilder MIB;
3397 1150 : if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
3398 1066 : MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
3399 533 : .add(MI.getOperand(0))
3400 : .addGlobalAddress(G);
3401 : } else {
3402 84 : MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
3403 42 : .add(MI.getOperand(0))
3404 : .addGlobalAddress(G);
3405 :
3406 : // There is an additional imm operand for tcreturn, but it should be in the
3407 : // right place already.
3408 : }
3409 :
3410 4741 : for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3411 4166 : MIB.add(MI.getOperand(I));
3412 :
3413 : MIB.cloneMemRefs(MI);
3414 575 : MI.eraseFromParent();
3415 : return BB;
3416 : }
3417 0 : default:
3418 0 : return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
3419 : }
3420 : }
3421 :
3422 29044 : bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3423 29044 : return isTypeLegal(VT.getScalarType());
3424 : }
3425 :
3426 4434 : bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3427 : // This currently forces unfolding various combinations of fsub into fma with
3428 : // free fneg'd operands. As long as we have fast FMA (controlled by
3429 : // isFMAFasterThanFMulAndFAdd), we should perform these.
3430 :
3431 : // When fma is quarter rate, for f64 where add / sub are at best half rate,
3432 : // most of these combines appear to be cycle neutral but save on instruction
3433 : // count / code size.
3434 4434 : return true;
3435 : }
3436 :
3437 14419 : EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3438 : EVT VT) const {
3439 14419 : if (!VT.isVector()) {
3440 14302 : return MVT::i1;
3441 : }
3442 117 : return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3443 : }
3444 :
3445 147959 : MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3446 : // TODO: Should i16 be used always if legal? For now it would force VALU
3447 : // shifts.
3448 147959 : return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3449 : }
3450 :
3451 : // Answering this is somewhat tricky and depends on the specific device, since
3452 : // devices have different rates for fma and for all f64 operations.
3453 : //
3454 : // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3455 : // regardless of which device (although the number of cycles differs between
3456 : // devices), so it is always profitable for f64.
3457 : //
3458 : // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3459 : // only on full rate devices. Normally, we should prefer selecting v_mad_f32
3460 : // which we can always do even without fused FP ops since it returns the same
3461 : // result as the separate operations and since it is always full
3462 : // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3463 : // however does not support denormals, so we do report fma as faster if we have
3464 : // a fast fma device and require denormals.
3465 : //
3466 12661 : bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3467 12661 : VT = VT.getScalarType();
3468 :
3469 12661 : switch (VT.getSimpleVT().SimpleTy) {
3470 9778 : case MVT::f32: {
3471 :     // This is as fast on some subtargets. However, we always have full-rate f32
3472 :     // mad available, which returns the same result as the separate operations and
3473 :     // which we should prefer over fma. We can't use mad if we want to support
3474 :     // denormals, so only report fma as faster in that case.
3475 9778 : if (Subtarget->hasFP32Denormals())
3476 628 : return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3477 :
3478 : // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3479 9150 : return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3480 : }
3481 : case MVT::f64:
3482 : return true;
3483 1842 : case MVT::f16:
3484 1842 : return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3485 : default:
3486 : break;
3487 : }
3488 :
3489 0 : return false;
3490 : }
3491 :
3492 : //===----------------------------------------------------------------------===//
3493 : // Custom DAG Lowering Operations
3494 : //===----------------------------------------------------------------------===//
3495 :
3496 : // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3497 : // wider vector type is legal.
3498 13 : SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
3499 : SelectionDAG &DAG) const {
3500 : unsigned Opc = Op.getOpcode();
3501 13 : EVT VT = Op.getValueType();
3502 : assert(VT == MVT::v4f16);
3503 :
3504 : SDValue Lo, Hi;
3505 13 : std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3506 :
3507 : SDLoc SL(Op);
3508 : SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3509 26 : Op->getFlags());
3510 : SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3511 26 : Op->getFlags());
3512 :
3513 26 : return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3514 : }
3515 :
3516 : // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3517 : // wider vector type is legal.
3518 125 : SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
3519 : SelectionDAG &DAG) const {
3520 : unsigned Opc = Op.getOpcode();
3521 125 : EVT VT = Op.getValueType();
3522 : assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3523 :
3524 : SDValue Lo0, Hi0;
3525 125 : std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3526 : SDValue Lo1, Hi1;
3527 125 : std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3528 :
3529 : SDLoc SL(Op);
3530 :
3531 : SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3532 250 : Op->getFlags());
3533 : SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3534 250 : Op->getFlags());
3535 :
3536 250 : return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3537 : }
3538 :
3539 219926 : SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3540 439852 : switch (Op.getOpcode()) {
3541 24175 : default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3542 1753 : case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3543 74303 : case ISD::LOAD: {
3544 74303 : SDValue Result = LowerLOAD(Op, DAG);
3545 : assert((!Result.getNode() ||
3546 : Result.getNode()->getNumValues() == 2) &&
3547 : "Load should return a value and a chain");
3548 74303 : return Result;
3549 : }
3550 :
3551 98 : case ISD::FSIN:
3552 : case ISD::FCOS:
3553 98 : return LowerTrig(Op, DAG);
3554 734 : case ISD::SELECT: return LowerSELECT(Op, DAG);
3555 251 : case ISD::FDIV: return LowerFDIV(Op, DAG);
3556 263 : case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3557 82361 : case ISD::STORE: return LowerSTORE(Op, DAG);
3558 1026 : case ISD::GlobalAddress: {
3559 1026 : MachineFunction &MF = DAG.getMachineFunction();
3560 1026 : SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3561 1026 : return LowerGlobalAddress(MFI, Op, DAG);
3562 : }
3563 21890 : case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3564 1617 : case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3565 2731 : case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3566 46 : case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3567 162 : case ISD::INSERT_VECTOR_ELT:
3568 162 : return lowerINSERT_VECTOR_ELT(Op, DAG);
3569 6568 : case ISD::EXTRACT_VECTOR_ELT:
3570 6568 : return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3571 1255 : case ISD::BUILD_VECTOR:
3572 1255 : return lowerBUILD_VECTOR(Op, DAG);
3573 519 : case ISD::FP_ROUND:
3574 519 : return lowerFP_ROUND(Op, DAG);
3575 27 : case ISD::TRAP:
3576 27 : return lowerTRAP(Op, DAG);
3577 9 : case ISD::DEBUGTRAP:
3578 9 : return lowerDEBUGTRAP(Op, DAG);
3579 13 : case ISD::FABS:
3580 : case ISD::FNEG:
3581 : case ISD::FCANONICALIZE:
3582 13 : return splitUnaryVectorOp(Op, DAG);
3583 125 : case ISD::SHL:
3584 : case ISD::SRA:
3585 : case ISD::SRL:
3586 : case ISD::ADD:
3587 : case ISD::SUB:
3588 : case ISD::MUL:
3589 : case ISD::SMIN:
3590 : case ISD::SMAX:
3591 : case ISD::UMIN:
3592 : case ISD::UMAX:
3593 : case ISD::FMINNUM:
3594 : case ISD::FMAXNUM:
3595 : case ISD::FADD:
3596 : case ISD::FMUL:
3597 125 : return splitBinaryVectorOp(Op, DAG);
3598 : }
3599 : return SDValue();
3600 : }
3601 :
3602 45 : static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
3603 : const SDLoc &DL,
3604 : SelectionDAG &DAG, bool Unpacked) {
3605 45 : if (!LoadVT.isVector())
3606 12 : return Result;
3607 :
3608 33 : if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3609 : // Truncate to v2i16/v4i16.
3610 19 : EVT IntLoadVT = LoadVT.changeTypeToInteger();
3611 :
3612 :     // Work around the legalizer not scalarizing the truncate after vector op
3613 :     // legalization by not creating an intermediate vector trunc.
3614 : SmallVector<SDValue, 4> Elts;
3615 19 : DAG.ExtractVectorElements(Result, Elts);
3616 77 : for (SDValue &Elt : Elts)
3617 58 : Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3618 :
3619 19 : Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3620 :
3621 : // Bitcast to original type (v2f16/v4f16).
3622 19 : return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3623 : }
3624 :
3625 : // Cast back to the original packed type.
3626 14 : return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3627 : }
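
A scalar picture of the unpacked-D16 case handled above (illustrative only): with unpacked D16 memory instructions each f16 element comes back in the low 16 bits of its own 32-bit lane, so the result is truncated lane by lane and rebuilt as a packed v2f16/v4f16 value. The two-element case is shown.

  #include <array>
  #include <cstdint>
  static std::array<uint16_t, 2>
  repackUnpackedD16(const std::array<uint32_t, 2> &Lanes) {
    // TRUNCATE each 32-bit lane to i16, then BUILD_VECTOR and BITCAST back to v2f16.
    return {uint16_t(Lanes[0]), uint16_t(Lanes[1])};
  }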
3628 :
3629 54 : SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3630 : MemSDNode *M,
3631 : SelectionDAG &DAG,
3632 : ArrayRef<SDValue> Ops,
3633 : bool IsIntrinsic) const {
3634 : SDLoc DL(M);
3635 :
3636 54 : bool Unpacked = Subtarget->hasUnpackedD16VMem();
3637 54 : EVT LoadVT = M->getValueType(0);
3638 :
3639 54 : EVT EquivLoadVT = LoadVT;
3640 72 : if (Unpacked && LoadVT.isVector()) {
3641 : EquivLoadVT = LoadVT.isVector() ?
3642 12 : EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3643 12 : LoadVT.getVectorNumElements()) : LoadVT;
3644 : }
3645 :
3646 : // Change from v4f16/v2f16 to EquivLoadVT.
3647 54 : SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3648 :
3649 : SDValue Load
3650 : = DAG.getMemIntrinsicNode(
3651 : IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3652 : VTList, Ops, M->getMemoryVT(),
3653 108 : M->getMemOperand());
3654 54 : if (!Unpacked) // Just adjusted the opcode.
3655 36 : return Load;
3656 :
3657 18 : SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
3658 :
3659 36 : return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
3660 : }
3661 :
3662 72 : static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
3663 : SDNode *N, SelectionDAG &DAG) {
3664 72 : EVT VT = N->getValueType(0);
3665 72 : const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3666 : if (!CD)
3667 8 : return DAG.getUNDEF(VT);
3668 :
3669 64 : int CondCode = CD->getSExtValue();
3670 64 : if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3671 : CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3672 4 : return DAG.getUNDEF(VT);
3673 :
3674 : ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3675 :
3676 :
3677 60 : SDValue LHS = N->getOperand(1);
3678 60 : SDValue RHS = N->getOperand(2);
3679 :
3680 : SDLoc DL(N);
3681 :
3682 : EVT CmpVT = LHS.getValueType();
3683 : if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3684 10 : unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3685 : ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3686 10 : LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3687 10 : RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3688 : }
3689 :
3690 60 : ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3691 :
3692 : return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3693 60 : DAG.getCondCode(CCOpcode));
3694 : }
3695 :
3696 86 : static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
3697 : SDNode *N, SelectionDAG &DAG) {
3698 86 : EVT VT = N->getValueType(0);
3699 86 : const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3700 : if (!CD)
3701 2 : return DAG.getUNDEF(VT);
3702 :
3703 84 : int CondCode = CD->getSExtValue();
3704 84 : if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3705 : CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3706 4 : return DAG.getUNDEF(VT);
3707 : }
3708 :
3709 80 : SDValue Src0 = N->getOperand(1);
3710 80 : SDValue Src1 = N->getOperand(2);
3711 : EVT CmpVT = Src0.getValueType();
3712 : SDLoc SL(N);
3713 :
3714 : if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3715 14 : Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3716 14 : Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3717 : }
3718 :
3719 : FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3720 80 : ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3721 : return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3722 80 : Src1, DAG.getCondCode(CCOpcode));
3723 : }
3724 :
3725 556 : void SITargetLowering::ReplaceNodeResults(SDNode *N,
3726 : SmallVectorImpl<SDValue> &Results,
3727 : SelectionDAG &DAG) const {
3728 1112 : switch (N->getOpcode()) {
3729 : case ISD::INSERT_VECTOR_ELT: {
3730 67 : if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3731 23 : Results.push_back(Res);
3732 67 : return;
3733 : }
3734 : case ISD::EXTRACT_VECTOR_ELT: {
3735 0 : if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3736 0 : Results.push_back(Res);
3737 0 : return;
3738 : }
3739 85 : case ISD::INTRINSIC_WO_CHAIN: {
3740 255 : unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3741 : switch (IID) {
3742 29 : case Intrinsic::amdgcn_cvt_pkrtz: {
3743 29 : SDValue Src0 = N->getOperand(1);
3744 29 : SDValue Src1 = N->getOperand(2);
3745 : SDLoc SL(N);
3746 : SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3747 29 : Src0, Src1);
3748 58 : Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3749 : return;
3750 : }
3751 56 : case Intrinsic::amdgcn_cvt_pknorm_i16:
3752 : case Intrinsic::amdgcn_cvt_pknorm_u16:
3753 : case Intrinsic::amdgcn_cvt_pk_i16:
3754 : case Intrinsic::amdgcn_cvt_pk_u16: {
3755 56 : SDValue Src0 = N->getOperand(1);
3756 56 : SDValue Src1 = N->getOperand(2);
3757 : SDLoc SL(N);
3758 : unsigned Opcode;
3759 :
3760 56 : if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3761 : Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3762 38 : else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3763 : Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3764 20 : else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3765 : Opcode = AMDGPUISD::CVT_PK_I16_I32;
3766 : else
3767 : Opcode = AMDGPUISD::CVT_PK_U16_U32;
3768 :
3769 112 : EVT VT = N->getValueType(0);
3770 : if (isTypeLegal(VT))
3771 0 : Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3772 : else {
3773 56 : SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3774 112 : Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3775 : }
3776 : return;
3777 : }
3778 : }
3779 : break;
3780 : }
3781 : case ISD::INTRINSIC_W_CHAIN: {
3782 0 : if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
3783 0 : Results.push_back(Res);
3784 0 : Results.push_back(Res.getValue(1));
3785 0 : return;
3786 : }
3787 :
3788 0 : break;
3789 : }
3790 : case ISD::SELECT: {
3791 : SDLoc SL(N);
3792 26 : EVT VT = N->getValueType(0);
3793 26 : EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3794 52 : SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3795 52 : SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3796 :
3797 26 : EVT SelectVT = NewVT;
3798 26 : if (NewVT.bitsLT(MVT::i32)) {
3799 2 : LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3800 2 : RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3801 : SelectVT = MVT::i32;
3802 : }
3803 :
3804 : SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3805 52 : N->getOperand(0), LHS, RHS);
3806 :
3807 0 : if (NewVT != SelectVT)
3808 2 : NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3809 52 : Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3810 : return;
3811 : }
3812 : case ISD::FNEG: {
3813 6 : if (N->getValueType(0) != MVT::v2f16)
3814 : break;
3815 :
3816 : SDLoc SL(N);
3817 10 : SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3818 :
3819 : SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3820 : BC,
3821 5 : DAG.getConstant(0x80008000, SL, MVT::i32));
3822 10 : Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3823 : return;
3824 : }
3825 : case ISD::FABS: {
3826 8 : if (N->getValueType(0) != MVT::v2f16)
3827 : break;
3828 :
3829 : SDLoc SL(N);
3830 12 : SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3831 :
3832 : SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3833 : BC,
3834 6 : DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3835 12 : Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3836 : return;
3837 : }
3838 : default:
3839 : break;
3840 : }
3841 : }
3842 :
3843 : /// Helper function for LowerBRCOND
3844 0 : static SDNode *findUser(SDValue Value, unsigned Opcode) {
3845 :
3846 : SDNode *Parent = Value.getNode();
3847 853 : for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3848 1906 : I != E; ++I) {
3849 :
3850 0 : if (I.getUse().get() != Value)
3851 0 : continue;
3852 :
3853 850 : if (I->getOpcode() == Opcode)
3854 : return *I;
3855 : }
3856 : return nullptr;
3857 : }
3858 :
3859 1753 : unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3860 1753 : if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3861 1806 : switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3862 : case Intrinsic::amdgcn_if:
3863 : return AMDGPUISD::IF;
3864 52 : case Intrinsic::amdgcn_else:
3865 52 : return AMDGPUISD::ELSE;
3866 75 : case Intrinsic::amdgcn_loop:
3867 75 : return AMDGPUISD::LOOP;
3868 : case Intrinsic::amdgcn_end_cf:
3869 : llvm_unreachable("should not occur");
3870 2 : default:
3871 2 : return 0;
3872 : }
3873 : }
3874 :
3875 : // break, if_break, else_break are all only used as inputs to loop, not
3876 : // directly as branch conditions.
3877 : return 0;
3878 : }
3879 :
3880 4 : void SITargetLowering::createDebuggerPrologueStackObjects(
3881 : MachineFunction &MF) const {
3882 : // Create stack objects that are used for emitting debugger prologue.
3883 : //
3884 : // Debugger prologue writes work group IDs and work item IDs to scratch memory
3885 : // at fixed location in the following format:
3886 :   // at a fixed location in the following format:
3887 : // offset 4: work group ID y
3888 : // offset 8: work group ID z
3889 : // offset 16: work item ID x
3890 : // offset 20: work item ID y
3891 : // offset 24: work item ID z
3892 4 : SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3893 : int ObjectIdx = 0;
3894 :
3895 : // For each dimension:
3896 16 : for (unsigned i = 0; i < 3; ++i) {
3897 : // Create fixed stack object for work group ID.
3898 12 : ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
3899 : Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
3900 : // Create fixed stack object for work item ID.
3901 12 : ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
3902 : Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
3903 : }
3904 4 : }
3905 :
3906 1338 : bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3907 1338 : const Triple &TT = getTargetMachine().getTargetTriple();
3908 1235 : return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3909 1441 : GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3910 103 : AMDGPU::shouldEmitConstantsToTextSection(TT);
3911 : }
3912 :
3913 694 : bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
3914 629 : return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
3915 557 : GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3916 137 : GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3917 842 : !shouldEmitFixup(GV) &&
3918 76 : !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
3919 : }
3920 :
3921 588 : bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
3922 588 : return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
3923 : }
3924 :
3925 : /// This transforms the control flow intrinsics to get the branch destination as
3926 : /// This transforms the control flow intrinsics to get the branch destination as
3927 : /// the last parameter, and also switches the branch target with BR if the need arises.
3928 : SelectionDAG &DAG) const {
3929 : SDLoc DL(BRCOND);
3930 :
3931 1753 : SDNode *Intr = BRCOND.getOperand(1).getNode();
3932 1753 : SDValue Target = BRCOND.getOperand(2);
3933 : SDNode *BR = nullptr;
3934 : SDNode *SetCC = nullptr;
3935 :
3936 1753 : if (Intr->getOpcode() == ISD::SETCC) {
3937 : // As long as we negate the condition everything is fine
3938 : SetCC = Intr;
3939 1425 : Intr = SetCC->getOperand(0).getNode();
3940 :
3941 : } else {
3942 : // Get the target from BR if we don't negate the condition
3943 : BR = findUser(BRCOND, ISD::BR);
3944 328 : Target = BR->getOperand(1);
3945 : }
3946 :
3947 : // FIXME: This changes the types of the intrinsics instead of introducing new
3948 : // nodes with the correct types.
3949 : // e.g. llvm.amdgcn.loop
3950 :
3951 : // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
3952 : // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
3953 :
3954 1753 : unsigned CFNode = isCFIntrinsic(Intr);
3955 1753 : if (CFNode == 0) {
3956 : // This is a uniform branch so we don't need to legalize.
3957 1153 : return BRCOND;
3958 : }
3959 :
3960 1200 : bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
3961 : Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
3962 :
3963 : assert(!SetCC ||
3964 : (SetCC->getConstantOperandVal(1) == 1 &&
3965 : cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
3966 : ISD::SETNE));
3967 :
3968 : // operands of the new intrinsic call
3969 : SmallVector<SDValue, 4> Ops;
3970 600 : if (HaveChain)
3971 600 : Ops.push_back(BRCOND.getOperand(0));
3972 :
3973 1200 : Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
3974 600 : Ops.push_back(Target);
3975 :
3976 1200 : ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
3977 :
3978 : // build the new intrinsic call
3979 600 : SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
3980 :
3981 600 : if (!HaveChain) {
3982 : SDValue Ops[] = {
3983 : SDValue(Result, 0),
3984 : BRCOND.getOperand(0)
3985 0 : };
3986 :
3987 0 : Result = DAG.getMergeValues(Ops, DL).getNode();
3988 : }
3989 :
3990 600 : if (BR) {
3991 : // Give the branch instruction our target
3992 : SDValue Ops[] = {
3993 90 : BR->getOperand(0),
3994 : BRCOND.getOperand(2)
3995 180 : };
3996 180 : SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
3997 90 : DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
3998 : BR = NewBR.getNode();
3999 : }
4000 :
4001 600 : SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4002 :
4003 : // Copy the intrinsic results to registers
4004 1725 : for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4005 : SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4006 525 : if (!CopyToReg)
4007 : continue;
4008 :
4009 522 : Chain = DAG.getCopyToReg(
4010 : Chain, DL,
4011 522 : CopyToReg->getOperand(1),
4012 : SDValue(Result, i - 1),
4013 1044 : SDValue());
4014 :
4015 1044 : DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4016 : }
4017 :
4018 : // Remove the old intrinsic from the chain
4019 1200 : DAG.ReplaceAllUsesOfValueWith(
4020 600 : SDValue(Intr, Intr->getNumValues() - 1),
4021 600 : Intr->getOperand(0));
4022 :
4023 600 : return Chain;
4024 : }
4025 :
4026 2526 : SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4027 : SDValue Op,
4028 : const SDLoc &DL,
4029 : EVT VT) const {
4030 2526 : return Op.getValueType().bitsLE(VT) ?
4031 2526 : DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4032 2526 : DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4033 : }
4034 :
4035 519 : SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
4036 : assert(Op.getValueType() == MVT::f16 &&
4037 : "Do not know how to custom lower FP_ROUND for non-f16 type");
4038 :
4039 519 : SDValue Src = Op.getOperand(0);
4040 : EVT SrcVT = Src.getValueType();
4041 : if (SrcVT != MVT::f64)
4042 509 : return Op;
4043 :
4044 : SDLoc DL(Op);
4045 :
4046 10 : SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4047 10 : SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
4048 10 : return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
4049 : }
4050 :
4051 27 : SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4052 : SDLoc SL(Op);
4053 27 : SDValue Chain = Op.getOperand(0);
4054 :
4055 27 : if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4056 12 : !Subtarget->isTrapHandlerEnabled())
4057 21 : return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
4058 :
4059 6 : MachineFunction &MF = DAG.getMachineFunction();
4060 6 : SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4061 : unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4062 : assert(UserSGPR != AMDGPU::NoRegister);
4063 : SDValue QueuePtr = CreateLiveInRegister(
4064 12 : DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4065 6 : SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4066 : SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4067 6 : QueuePtr, SDValue());
4068 : SDValue Ops[] = {
4069 : ToReg,
4070 6 : DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
4071 : SGPR01,
4072 6 : ToReg.getValue(1)
4073 6 : };
4074 6 : return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4075 : }
4076 :
4077 9 : SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4078 : SDLoc SL(Op);
4079 9 : SDValue Chain = Op.getOperand(0);
4080 9 : MachineFunction &MF = DAG.getMachineFunction();
4081 :
4082 9 : if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4083 4 : !Subtarget->isTrapHandlerEnabled()) {
4084 : DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
4085 : "debugtrap handler not supported",
4086 : Op.getDebugLoc(),
4087 7 : DS_Warning);
4088 7 : LLVMContext &Ctx = MF.getFunction().getContext();
4089 7 : Ctx.diagnose(NoTrap);
4090 7 : return Chain;
4091 : }
4092 :
4093 : SDValue Ops[] = {
4094 : Chain,
4095 2 : DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
4096 2 : };
4097 2 : return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4098 : }
4099 :
4100 32 : SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
4101 : SelectionDAG &DAG) const {
4102 : // FIXME: Use inline constants (src_{shared, private}_base) instead.
4103 32 : if (Subtarget->hasApertureRegs()) {
4104 12 : unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
4105 : AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4106 : AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
4107 : unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
4108 : AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4109 : AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4110 12 : unsigned Encoding =
4111 : AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4112 12 : Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4113 : WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
4114 :
4115 12 : SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4116 : SDValue ApertureReg = SDValue(
4117 12 : DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4118 12 : SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4119 12 : return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
4120 : }
4121 :
4122 20 : MachineFunction &MF = DAG.getMachineFunction();
4123 20 : SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4124 : unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4125 : assert(UserSGPR != AMDGPU::NoRegister);
4126 :
4127 : SDValue QueuePtr = CreateLiveInRegister(
4128 40 : DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4129 :
4130 : // Offset into amd_queue_t for group_segment_aperture_base_hi /
4131 : // private_segment_aperture_base_hi.
4132 20 : uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
4133 :
4134 20 : SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
4135 :
4136 : // TODO: Use custom target PseudoSourceValue.
4137 : // TODO: We should use the value from the IR intrinsic call, but it might not
4138 :   // be available, and it is unclear how we would get it here.
4139 20 : Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
4140 : AMDGPUAS::CONSTANT_ADDRESS));
4141 :
4142 : MachinePointerInfo PtrInfo(V, StructOffset);
4143 : return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
4144 : MinAlign(64, StructOffset),
4145 : MachineMemOperand::MODereferenceable |
4146 40 : MachineMemOperand::MOInvariant);
4147 : }
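
The fallback path above can be pictured with the following layout sketch (an assumption based on the amd_queue_t field names mentioned in the comment; the struct is not part of LLVM): the loaded 32-bit "base hi" word later becomes the high half of the 64-bit flat pointer built in lowerADDRSPACECAST.

  #include <cstdint>
  struct QueueAperturesView {              // subset of amd_queue_t, offsets per the code above
    char Pad[0x40];
    uint32_t GroupSegmentApertureBaseHi;   // offset 0x40, used for LOCAL_ADDRESS
    uint32_t PrivateSegmentApertureBaseHi; // offset 0x44, used for PRIVATE_ADDRESS
  };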
4148 :
4149 46 : SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4150 : SelectionDAG &DAG) const {
4151 : SDLoc SL(Op);
4152 : const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4153 :
4154 46 : SDValue Src = ASC->getOperand(0);
4155 46 : SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4156 :
4157 : const AMDGPUTargetMachine &TM =
4158 : static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4159 :
4160 : // flat -> local/private
4161 46 : if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4162 12 : unsigned DestAS = ASC->getDestAddressSpace();
4163 :
4164 24 : if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4165 12 : DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
4166 : unsigned NullVal = TM.getNullPointerValue(DestAS);
4167 12 : SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4168 12 : SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4169 12 : SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4170 :
4171 : return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4172 12 : NonNull, Ptr, SegmentNullPtr);
4173 : }
4174 : }
4175 :
4176 : // local/private -> flat
4177 34 : if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4178 : unsigned SrcAS = ASC->getSrcAddressSpace();
4179 :
4180 66 : if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4181 33 : SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
4182 : unsigned NullVal = TM.getNullPointerValue(SrcAS);
4183 32 : SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4184 :
4185 : SDValue NonNull
4186 32 : = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4187 :
4188 32 : SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
4189 : SDValue CvtPtr
4190 32 : = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4191 :
4192 : return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4193 : DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4194 32 : FlatNullPtr);
4195 : }
4196 : }
4197 :
4198 : // global <-> flat are no-ops and never emitted.
4199 :
4200 2 : const MachineFunction &MF = DAG.getMachineFunction();
4201 : DiagnosticInfoUnsupported InvalidAddrSpaceCast(
4202 2 : MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
4203 2 : DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4204 :
4205 4 : return DAG.getUNDEF(ASC->getValueType(0));
4206 : }
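
A scalar sketch of the two cast directions handled above (illustrative only; SegmentNull stands for TM.getNullPointerValue() of the 32-bit address space and Aperture for the value produced by getSegmentAperture):

  #include <cstdint>
  static uint32_t flatToSegment(uint64_t Flat, uint32_t SegmentNull) {
    return Flat != 0 ? uint32_t(Flat)   // truncate the non-null pointer
                     : SegmentNull;     // map the flat null pointer to the segment null
  }
  static uint64_t segmentToFlat(uint32_t Seg, uint32_t SegmentNull, uint32_t Aperture) {
    return Seg != SegmentNull ? ((uint64_t(Aperture) << 32) | Seg)  // {ptr, aperture} bitcast to i64
                              : 0;                                  // flat null pointer
  }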
4207 :
4208 229 : SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4209 : SelectionDAG &DAG) const {
4210 229 : SDValue Vec = Op.getOperand(0);
4211 229 : SDValue InsVal = Op.getOperand(1);
4212 229 : SDValue Idx = Op.getOperand(2);
4213 229 : EVT VecVT = Vec.getValueType();
4214 229 : EVT EltVT = VecVT.getVectorElementType();
4215 229 : unsigned VecSize = VecVT.getSizeInBits();
4216 229 : unsigned EltSize = EltVT.getSizeInBits();
4217 :
4218 :
4219 : assert(VecSize <= 64);
4220 :
4221 : unsigned NumElts = VecVT.getVectorNumElements();
4222 : SDLoc SL(Op);
4223 : auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4224 :
4225 229 : if (NumElts == 4 && EltSize == 16 && KIdx) {
4226 20 : SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4227 :
4228 : SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4229 20 : DAG.getConstant(0, SL, MVT::i32));
4230 : SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4231 20 : DAG.getConstant(1, SL, MVT::i32));
4232 :
4233 20 : SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4234 20 : SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4235 :
4236 40 : unsigned Idx = KIdx->getZExtValue();
4237 : bool InsertLo = Idx < 2;
4238 : SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4239 20 : InsertLo ? LoVec : HiVec,
4240 : DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4241 47 : DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4242 :
4243 20 : InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4244 :
4245 : SDValue Concat = InsertLo ?
4246 14 : DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4247 27 : DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4248 :
4249 20 : return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4250 : }
4251 :
4252 : if (isa<ConstantSDNode>(Idx))
4253 164 : return SDValue();
4254 :
4255 45 : MVT IntVT = MVT::getIntegerVT(VecSize);
4256 :
4257 : // Avoid stack access for dynamic indexing.
4258 45 : SDValue Val = InsVal;
4259 : if (InsVal.getValueType() == MVT::f16)
4260 21 : Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
4261 :
4262 : // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4263 45 : SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
4264 :
4265 : assert(isPowerOf2_32(EltSize));
4266 45 : SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4267 :
4268 : // Convert vector index to bit-index.
4269 45 : SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4270 :
4271 45 : SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4272 : SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4273 : DAG.getConstant(0xffff, SL, IntVT),
4274 45 : ScaledIdx);
4275 :
4276 45 : SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4277 : SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4278 45 : DAG.getNOT(SL, BFM, IntVT), BCVec);
4279 :
4280 45 : SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4281 45 : return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
4282 : }
4283 :
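// Conceptual sketch (illustrative, not from the source) of the mask-based dynamic
// insert referenced by the v_bfi_b32/v_bfm_b32 comment above, written with plain
// integers for a vector of 16-bit elements that fits in 64 bits.
#include <cstdint>
static uint64_t insertElt16(uint64_t Vec, uint16_t Val, unsigned Idx) {
  unsigned BitIdx = Idx * 16;                 // vector index -> bit index
  uint64_t Mask = uint64_t(0xffff) << BitIdx; // v_bfm: ones over the element slot
  uint64_t Ins  = uint64_t(Val) << BitIdx;    // value moved into that slot
  return (Mask & Ins) | (~Mask & Vec);        // v_bfi: blend value into the vector
}
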
4284 6568 : SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4285 : SelectionDAG &DAG) const {
4286 : SDLoc SL(Op);
4287 :
4288 6568 : EVT ResultVT = Op.getValueType();
4289 6568 : SDValue Vec = Op.getOperand(0);
4290 6568 : SDValue Idx = Op.getOperand(1);
4291 6568 : EVT VecVT = Vec.getValueType();
4292 6568 : unsigned VecSize = VecVT.getSizeInBits();
4293 6568 : EVT EltVT = VecVT.getVectorElementType();
4294 : assert(VecSize <= 64);
4295 :
4296 : DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4297 :
4298 : // Make sure we do any optimizations that will make it easier to fold
4299 : // source modifiers before obscuring it with bit operations.
4300 :
4301 : // XXX - Why doesn't this get called when vector_shuffle is expanded?
4302 6568 : if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4303 7 : return Combined;
4304 :
4305 6561 : unsigned EltSize = EltVT.getSizeInBits();
4306 : assert(isPowerOf2_32(EltSize));
4307 :
4308 6561 : MVT IntVT = MVT::getIntegerVT(VecSize);
4309 6561 : SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4310 :
4311 : // Convert vector index to bit-index (* EltSize)
4312 6561 : SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4313 :
4314 6561 : SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4315 6561 : SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
4316 :
4317 : if (ResultVT == MVT::f16) {
4318 1652 : SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4319 1652 : return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4320 : }
4321 :
4322 4909 : return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4323 : }
4324 :
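// Minimal sketch (illustrative only) of the shift-and-truncate extract above, for
// a <= 64-bit vector of 16-bit elements.
#include <cstdint>
static uint16_t extractElt16(uint64_t Vec, unsigned Idx) {
  unsigned BitIdx = Idx * 16;     // vector index -> bit index (SHL by log2(EltSize))
  return uint16_t(Vec >> BitIdx); // SRL then TRUNCATE
}
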
4325 1255 : SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4326 : SelectionDAG &DAG) const {
4327 : SDLoc SL(Op);
4328 1255 : EVT VT = Op.getValueType();
4329 :
4330 : if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4331 292 : EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4332 :
4333 : // Turn into pair of packed build_vectors.
4334 : // TODO: Special case for constants that can be materialized with s_mov_b64.
4335 : SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4336 584 : { Op.getOperand(0), Op.getOperand(1) });
4337 : SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4338 584 : { Op.getOperand(2), Op.getOperand(3) });
4339 :
4340 292 : SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4341 292 : SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4342 :
4343 584 : SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4344 292 : return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4345 : }
4346 :
4347 : assert(VT == MVT::v2f16 || VT == MVT::v2i16);
4348 : assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
4349 :
4350 963 : SDValue Lo = Op.getOperand(0);
4351 963 : SDValue Hi = Op.getOperand(1);
4352 :
4353 : // Avoid adding defined bits with the zero_extend.
4354 963 : if (Hi.isUndef()) {
4355 33 : Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4356 33 : SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
4357 33 : return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
4358 : }
4359 :
4360 930 : Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
4361 930 : Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4362 :
4363 : SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4364 930 : DAG.getConstant(16, SL, MVT::i32));
4365 930 : if (Lo.isUndef())
4366 11 : return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
4367 :
4368 919 : Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4369 919 : Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
4370 :
4371 919 : SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
4372 919 : return DAG.getNode(ISD::BITCAST, SL, VT, Or);
4373 : }
4374 :
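// Sketch (illustrative only) of the v2i16/v2f16 packing done above when VOP3P
// instructions are not available: both halves are zero-extended and combined with
// a shift and an OR.
#include <cstdint>
static uint32_t packV2x16(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16); // i32 view of the packed pair
}
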
4375 : bool
4376 1725 : SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4377 : // We can fold offsets for anything that doesn't require a GOT relocation.
4378 3413 : return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4379 3307 : GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4380 3450 : GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4381 106 : !shouldEmitGOTReloc(GA->getGlobal());
4382 : }
4383 :
4384 : static SDValue
4385 613 : buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4386 : const SDLoc &DL, unsigned Offset, EVT PtrVT,
4387 : unsigned GAFlags = SIInstrInfo::MO_NONE) {
4388 : // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4389 : // lowered to the following code sequence:
4390 : //
4391 : // For constant address space:
4392 : // s_getpc_b64 s[0:1]
4393 : // s_add_u32 s0, s0, $symbol
4394 : // s_addc_u32 s1, s1, 0
4395 : //
4396 : // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4397 : // a fixup or relocation is emitted to replace $symbol with a literal
4398 : // constant, which is a pc-relative offset from the encoding of the $symbol
4399 : // operand to the global variable.
4400 : //
4401 : // For global address space:
4402 : // s_getpc_b64 s[0:1]
4403 : // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4404 : // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4405 : //
4406 : // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4407 : // fixups or relocations are emitted to replace $symbol@*@lo and
4408 : // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4409 : // which is a 64-bit pc-relative offset from the encoding of the $symbol
4410 : // operand to the global variable.
4411 : //
4412 : // What we want here is an offset from the value returned by s_getpc
4413 : // (which is the address of the s_add_u32 instruction) to the global
4414 : // variable, but since the encoding of $symbol starts 4 bytes after the start
4415 : // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4416 : // small. This requires us to add 4 to the global variable offset in order to
4417 : // compute the correct address.
4418 613 : SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4419 613 : GAFlags);
4420 : SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4421 : GAFlags == SIInstrInfo::MO_NONE ?
4422 1226 : GAFlags : GAFlags + 1);
4423 613 : return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
4424 : }
4425 :
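// Worked model (illustrative, assuming a REL32-style relocation that stores
// S + A - P) of the "+ 4" above: the literal patched by the fixup sits 4 bytes
// after the address returned by s_getpc_b64, so with addend A = Offset + 4 the
// add sequence reproduces GV + Offset. The helper and its parameters are
// hypothetical.
#include <cstdint>
static uint64_t resolvePcRel(uint64_t Pc, uint64_t GV, uint64_t Offset) {
  uint64_t Addend = Offset + 4;              // addend emitted on the symbol above
  uint64_t Literal = GV + Addend - (Pc + 4); // S + A - P, with P = Pc + 4
  return Pc + Literal;                       // s_add_u32/s_addc_u32 == GV + Offset
}
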
4426 1026 : SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4427 : SDValue Op,
4428 : SelectionDAG &DAG) const {
4429 : GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
4430 1026 : const GlobalValue *GV = GSD->getGlobal();
4431 1639 : if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4432 1639 : GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
4433 613 : GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
4434 413 : return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4435 :
4436 : SDLoc DL(GSD);
4437 613 : EVT PtrVT = Op.getValueType();
4438 :
4439 : // FIXME: Should not make address space based decisions here.
4440 613 : if (shouldEmitFixup(GV))
4441 25 : return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
4442 588 : else if (shouldEmitPCReloc(GV))
4443 563 : return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4444 563 : SIInstrInfo::MO_REL32);
4445 :
4446 : SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
4447 25 : SIInstrInfo::MO_GOTPCREL32);
4448 :
4449 25 : Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
4450 25 : PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
4451 25 : const DataLayout &DataLayout = DAG.getDataLayout();
4452 25 : unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
4453 : MachinePointerInfo PtrInfo
4454 25 : = MachinePointerInfo::getGOT(DAG.getMachineFunction());
4455 :
4456 : return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
4457 : MachineMemOperand::MODereferenceable |
4458 25 : MachineMemOperand::MOInvariant);
4459 : }
4460 :
4461 9146 : SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4462 : const SDLoc &DL, SDValue V) const {
4463 : // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4464 : // the destination register.
4465 : //
4466 : // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4467 : // so we will end up with redundant moves to m0.
4468 : //
4469 : // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4470 :
4471 : // A Null SDValue creates a glue result.
4472 9146 : SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4473 : V, Chain);
4474 9146 : return SDValue(M0, 0);
4475 : }
4476 :
4477 91 : SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4478 : SDValue Op,
4479 : MVT VT,
4480 : unsigned Offset) const {
4481 : SDLoc SL(Op);
4482 : SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
4483 182 : DAG.getEntryNode(), Offset, 4, false);
4484 : // The local size values will have the hi 16-bits as zero.
4485 : return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4486 91 : DAG.getValueType(VT));
4487 : }
4488 :
4489 2 : static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4490 : EVT VT) {
4491 2 : DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4492 : "non-hsa intrinsic with hsa target",
4493 2 : DL.getDebugLoc());
4494 2 : DAG.getContext()->diagnose(BadIntrin);
4495 2 : return DAG.getUNDEF(VT);
4496 : }
4497 :
4498 5 : static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4499 : EVT VT) {
4500 5 : DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4501 : "intrinsic not supported on subtarget",
4502 5 : DL.getDebugLoc());
4503 5 : DAG.getContext()->diagnose(BadIntrin);
4504 5 : return DAG.getUNDEF(VT);
4505 : }
4506 :
4507 737 : static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
4508 : ArrayRef<SDValue> Elts) {
4509 : assert(!Elts.empty());
4510 : MVT Type;
4511 : unsigned NumElts;
4512 :
4513 737 : if (Elts.size() == 1) {
4514 : Type = MVT::f32;
4515 : NumElts = 1;
4516 509 : } else if (Elts.size() == 2) {
4517 : Type = MVT::v2f32;
4518 : NumElts = 2;
4519 349 : } else if (Elts.size() <= 4) {
4520 : Type = MVT::v4f32;
4521 : NumElts = 4;
4522 101 : } else if (Elts.size() <= 8) {
4523 : Type = MVT::v8f32;
4524 : NumElts = 8;
4525 : } else {
4526 : assert(Elts.size() <= 16);
4527 : Type = MVT::v16f32;
4528 : NumElts = 16;
4529 : }
4530 :
4531 737 : SmallVector<SDValue, 16> VecElts(NumElts);
4532 2760 : for (unsigned i = 0; i < Elts.size(); ++i) {
4533 4046 : SDValue Elt = Elts[i];
4534 : if (Elt.getValueType() != MVT::f32)
4535 714 : Elt = DAG.getBitcast(MVT::f32, Elt);
4536 2023 : VecElts[i] = Elt;
4537 : }
4538 1126 : for (unsigned i = Elts.size(); i < NumElts; ++i)
4539 389 : VecElts[i] = DAG.getUNDEF(MVT::f32);
4540 :
4541 737 : if (NumElts == 1)
4542 228 : return VecElts[0];
4543 509 : return DAG.getBuildVector(Type, DL, VecElts);
4544 : }
4545 :
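// Small sketch (illustrative only) of the size bucketing above: the operand count
// is rounded up to the next supported vector size (1, 2, 4, 8 or 16 dwords) and
// the tail is padded with undef.
static unsigned roundToAddrVectorSize(unsigned NumElts) {
  if (NumElts <= 2)
    return NumElts; // f32 or v2f32
  if (NumElts <= 4)
    return 4;       // v4f32
  if (NumElts <= 8)
    return 8;       // v8f32
  return 16;        // v16f32 (NumElts <= 16 asserted above)
}
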
4546 0 : static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
4547 : SDValue *GLC, SDValue *SLC) {
4548 : auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
4549 : if (!CachePolicyConst)
4550 0 : return false;
4551 :
4552 0 : uint64_t Value = CachePolicyConst->getZExtValue();
4553 0 : SDLoc DL(CachePolicy);
4554 0 : if (GLC) {
4555 0 : *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4556 0 : Value &= ~(uint64_t)0x1;
4557 : }
4558 0 : if (SLC) {
4559 0 : *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4560 0 : Value &= ~(uint64_t)0x2;
4561 : }
4562 :
4563 0 : return Value == 0;
4564 : }
4565 :
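// Sketch (illustrative only) of the cachepolicy decoding above: bit 0 is glc,
// bit 1 is slc, and any leftover bit makes the policy invalid.
#include <cstdint>
static bool decodeCachePolicy(uint64_t Value, bool *GLC, bool *SLC) {
  if (GLC) {
    *GLC = Value & 0x1;
    Value &= ~uint64_t(0x1);
  }
  if (SLC) {
    *SLC = Value & 0x2;
    Value &= ~uint64_t(0x2);
  }
  return Value == 0; // leftover bits -> reject, mirroring parseCachePolicy
}
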
4566 744 : SDValue SITargetLowering::lowerImage(SDValue Op,
4567 : const AMDGPU::ImageDimIntrinsicInfo *Intr,
4568 : SelectionDAG &DAG) const {
4569 : SDLoc DL(Op);
4570 744 : MachineFunction &MF = DAG.getMachineFunction();
4571 744 : const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
4572 : const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4573 744 : AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4574 744 : const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
4575 : const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4576 744 : AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
4577 : unsigned IntrOpcode = Intr->BaseOpcode;
4578 :
4579 744 : SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
4580 : bool IsD16 = false;
4581 : bool IsA16 = false;
4582 744 : SDValue VData;
4583 : int NumVDataDwords;
4584 : unsigned AddrIdx; // Index of first address argument
4585 : unsigned DMask;
4586 :
4587 744 : if (BaseOpcode->Atomic) {
4588 42 : VData = Op.getOperand(2);
4589 :
4590 42 : bool Is64Bit = VData.getValueType() == MVT::i64;
4591 42 : if (BaseOpcode->AtomicX2) {
4592 2 : SDValue VData2 = Op.getOperand(3);
4593 2 : VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
4594 6 : {VData, VData2});
4595 2 : if (Is64Bit)
4596 0 : VData = DAG.getBitcast(MVT::v4i32, VData);
4597 :
4598 2 : ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
4599 2 : DMask = Is64Bit ? 0xf : 0x3;
4600 2 : NumVDataDwords = Is64Bit ? 4 : 2;
4601 : AddrIdx = 4;
4602 : } else {
4603 40 : DMask = Is64Bit ? 0x3 : 0x1;
4604 40 : NumVDataDwords = Is64Bit ? 2 : 1;
4605 : AddrIdx = 3;
4606 : }
4607 : } else {
4608 : unsigned DMaskIdx;
4609 :
4610 702 : if (BaseOpcode->Store) {
4611 87 : VData = Op.getOperand(2);
4612 :
4613 87 : MVT StoreVT = VData.getSimpleValueType();
4614 87 : if (StoreVT.getScalarType() == MVT::f16) {
4615 12 : if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
4616 12 : !BaseOpcode->HasD16)
4617 0 : return Op; // D16 is unsupported for this instruction
4618 :
4619 : IsD16 = true;
4620 12 : VData = handleD16VData(VData, DAG);
4621 : }
4622 :
4623 174 : NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
4624 : DMaskIdx = 3;
4625 : } else {
4626 615 : MVT LoadVT = Op.getSimpleValueType();
4627 615 : if (LoadVT.getScalarType() == MVT::f16) {
4628 27 : if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
4629 27 : !BaseOpcode->HasD16)
4630 0 : return Op; // D16 is unsupported for this instruction
4631 :
4632 : IsD16 = true;
4633 54 : if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
4634 11 : ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
4635 : }
4636 :
4637 615 : NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
4638 615 : DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
4639 : }
4640 :
4641 : auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
4642 : if (!DMaskConst)
4643 0 : return Op;
4644 :
4645 702 : AddrIdx = DMaskIdx + 1;
4646 702 : DMask = DMaskConst->getZExtValue();
4647 702 : if (!DMask && !BaseOpcode->Store) {
4648 : // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
4649 : // store the channels' default values.
4650 7 : SDValue Undef = DAG.getUNDEF(Op.getValueType());
4651 7 : if (isa<MemSDNode>(Op))
4652 6 : return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
4653 5 : return Undef;
4654 : }
4655 : }
4656 :
4657 737 : unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
4658 737 : unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
4659 737 : unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
4660 737 : unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
4661 : NumCoords + NumLCM;
4662 : unsigned NumMIVAddrs = NumVAddrs;
4663 :
4664 : SmallVector<SDValue, 4> VAddrs;
4665 :
4666 : // Optimize _L to _LZ when _L is zero or negative
4667 737 : if (LZMappingInfo) {
4668 : if (auto ConstantLod =
4669 60 : dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
4670 60 : if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4671 30 : IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
4672 30 : NumMIVAddrs--; // remove 'lod'
4673 : }
4674 : }
4675 : }
4676 :
4677 : // Check for 16 bit addresses and pack if true.
4678 737 : unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4679 737 : MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
4680 737 : if (VAddrVT.getScalarType() == MVT::f16 &&
4681 : ST->hasFeature(AMDGPU::FeatureR128A16)) {
4682 : IsA16 = true;
4683 228 : for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
4684 167 : SDValue AddrLo, AddrHi;
4685 : // Push back extra arguments.
4686 167 : if (i < DimIdx) {
4687 42 : AddrLo = Op.getOperand(i);
4688 : } else {
4689 125 : AddrLo = Op.getOperand(i);
4690 : // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
4691 : // in 1D, derivatives dx/dh and dx/dv are packed with undef.
4692 125 : if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
4693 91 : ((NumGradients / 2) % 2 == 1 &&
4694 25 : (i == DimIdx + (NumGradients / 2) - 1 ||
4695 16 : i == DimIdx + NumGradients - 1))) {
4696 52 : AddrHi = DAG.getUNDEF(MVT::f16);
4697 : } else {
4698 73 : AddrHi = Op.getOperand(i + 1);
4699 : i++;
4700 : }
4701 125 : AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f16,
4702 250 : {AddrLo, AddrHi});
4703 125 : AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
4704 : }
4705 167 : VAddrs.push_back(AddrLo);
4706 : }
4707 : } else {
4708 2532 : for (unsigned i = 0; i < NumMIVAddrs; ++i)
4709 3712 : VAddrs.push_back(Op.getOperand(AddrIdx + i));
4710 : }
4711 :
4712 1474 : SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
4713 :
4714 737 : SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
4715 737 : SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
4716 : unsigned CtrlIdx; // Index of texfailctrl argument
4717 737 : SDValue Unorm;
4718 737 : if (!BaseOpcode->Sampler) {
4719 239 : Unorm = True;
4720 239 : CtrlIdx = AddrIdx + NumVAddrs + 1;
4721 : } else {
4722 : auto UnormConst =
4723 498 : dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
4724 : if (!UnormConst)
4725 0 : return Op;
4726 :
4727 996 : Unorm = UnormConst->getZExtValue() ? True : False;
4728 498 : CtrlIdx = AddrIdx + NumVAddrs + 3;
4729 : }
4730 :
4731 737 : SDValue TexFail = Op.getOperand(CtrlIdx);
4732 : auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
4733 1474 : if (!TexFailConst || TexFailConst->getZExtValue() != 0)
4734 0 : return Op;
4735 :
4736 737 : SDValue GLC;
4737 737 : SDValue SLC;
4738 737 : if (BaseOpcode->Atomic) {
4739 42 : GLC = True; // TODO no-return optimization
4740 84 : if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
4741 0 : return Op;
4742 : } else {
4743 1390 : if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
4744 0 : return Op;
4745 : }
4746 :
4747 : SmallVector<SDValue, 14> Ops;
4748 737 : if (BaseOpcode->Store || BaseOpcode->Atomic)
4749 129 : Ops.push_back(VData); // vdata
4750 737 : Ops.push_back(VAddr);
4751 1474 : Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
4752 737 : if (BaseOpcode->Sampler)
4753 996 : Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
4754 737 : Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
4755 737 : Ops.push_back(Unorm);
4756 737 : Ops.push_back(GLC);
4757 737 : Ops.push_back(SLC);
4758 737 : Ops.push_back(IsA16 && // a16 or r128
4759 : ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
4760 737 : Ops.push_back(False); // tfe
4761 737 : Ops.push_back(False); // lwe
4762 1389 : Ops.push_back(DimInfo->DA ? True : False);
4763 737 : if (BaseOpcode->HasD16)
4764 1289 : Ops.push_back(IsD16 ? True : False);
4765 737 : if (isa<MemSDNode>(Op))
4766 706 : Ops.push_back(Op.getOperand(0)); // chain
4767 :
4768 737 : int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
4769 : int Opcode = -1;
4770 :
4771 737 : if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4772 443 : Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
4773 : NumVDataDwords, NumVAddrDwords);
4774 443 : if (Opcode == -1)
4775 716 : Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
4776 : NumVDataDwords, NumVAddrDwords);
4777 : assert(Opcode != -1);
4778 :
4779 737 : MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
4780 : if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
4781 706 : MachineMemOperand *MemRef = MemOp->getMemOperand();
4782 1412 : DAG.setNodeMemRefs(NewNode, {MemRef});
4783 : }
4784 :
4785 737 : if (BaseOpcode->AtomicX2) {
4786 : SmallVector<SDValue, 1> Elt;
4787 2 : DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
4788 4 : return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
4789 735 : } else if (IsD16 && !BaseOpcode->Store) {
4790 : MVT LoadVT = Op.getSimpleValueType();
4791 : SDValue Adjusted = adjustLoadValueTypeImpl(
4792 54 : SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
4793 54 : return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
4794 : }
4795 :
4796 708 : return SDValue(NewNode, 0);
4797 : }
4798 :
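// Simplified sketch (illustrative, not from the source) of the A16 address packing
// above when FeatureR128A16 is enabled: 16-bit coordinates go two per dword, with a
// trailing odd coordinate paired with an undefined high half. The extra undef
// padding at gradient-group boundaries in the real code is omitted here.
#include <cstdint>
#include <vector>
static std::vector<uint32_t> packA16Coords(const std::vector<uint16_t> &Coords) {
  std::vector<uint32_t> Dwords;
  for (size_t I = 0; I < Coords.size(); I += 2) {
    uint32_t Lo = Coords[I];
    uint32_t Hi = (I + 1 < Coords.size()) ? Coords[I + 1] : 0; // undef in the DAG
    Dwords.push_back(Lo | (Hi << 16));
  }
  return Dwords;
}
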
4799 528 : SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
4800 : SDValue Offset, SDValue GLC,
4801 : SelectionDAG &DAG) const {
4802 528 : MachineFunction &MF = DAG.getMachineFunction();
4803 1056 : MachineMemOperand *MMO = MF.getMachineMemOperand(
4804 : MachinePointerInfo(),
4805 : MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4806 : MachineMemOperand::MOInvariant,
4807 : VT.getStoreSize(), VT.getStoreSize());
4808 :
4809 1056 : if (!Offset->isDivergent()) {
4810 : SDValue Ops[] = {
4811 : Rsrc,
4812 : Offset, // Offset
4813 : GLC // glc
4814 464 : };
4815 : return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
4816 464 : DAG.getVTList(VT), Ops, VT, MMO);
4817 : }
4818 :
4819 : // We have a divergent offset. Emit a MUBUF buffer load instead. We can
4820 : // assume that the buffer is unswizzled.
4821 : SmallVector<SDValue, 4> Loads;
4822 : unsigned NumLoads = 1;
4823 : MVT LoadVT = VT.getSimpleVT();
4824 :
4825 : assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
4826 : LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
4827 :
4828 : if (VT == MVT::v8i32 || VT == MVT::v16i32) {
4829 : NumLoads = VT == MVT::v16i32 ? 4 : 2;
4830 : LoadVT = MVT::v4i32;
4831 : }
4832 :
4833 128 : SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
4834 64 : unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
4835 : SDValue Ops[] = {
4836 64 : DAG.getEntryNode(), // Chain
4837 : Rsrc, // rsrc
4838 64 : DAG.getConstant(0, DL, MVT::i32), // vindex
4839 : {}, // voffset
4840 : {}, // soffset
4841 : {}, // offset
4842 64 : DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
4843 64 : DAG.getConstant(0, DL, MVT::i1), // idxen
4844 64 : };
4845 :
4846 : // Use the alignment to ensure that the required offsets will fit into the
4847 : // immediate offsets.
4848 64 : setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
4849 :
4850 64 : uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
4851 168 : for (unsigned i = 0; i < NumLoads; ++i) {
4852 104 : Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
4853 104 : Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
4854 208 : Ops, LoadVT, MMO));
4855 : }
4856 :
4857 : if (VT == MVT::v8i32 || VT == MVT::v16i32)
4858 16 : return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
4859 :
4860 48 : return Loads[0];
4861 : }
4862 :
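// Sketch (illustrative only) of the divergent-offset path above: the wide result is
// assembled from 4-dword buffer loads whose immediate offsets step by 16 bytes. The
// helper is a hypothetical model of the offsets fed into Ops[5] in the loop.
#include <cstdint>
#include <vector>
static std::vector<uint64_t> splitLoadOffsets(uint64_t InstOffset,
                                              unsigned NumLoads) {
  std::vector<uint64_t> Offsets;
  for (unsigned I = 0; I < NumLoads; ++I)
    Offsets.push_back(InstOffset + 16 * I); // one BUFFER_LOAD per 16 bytes
  return Offsets;
}
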
4863 21890 : SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
4864 : SelectionDAG &DAG) const {
4865 21890 : MachineFunction &MF = DAG.getMachineFunction();
4866 21890 : auto MFI = MF.getInfo<SIMachineFunctionInfo>();
4867 :
4868 21890 : EVT VT = Op.getValueType();
4869 : SDLoc DL(Op);
4870 21890 : unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4871 :
4872 : // TODO: Should this propagate fast-math-flags?
4873 :
4874 21890 : switch (IntrinsicID) {
4875 4 : case Intrinsic::amdgcn_implicit_buffer_ptr: {
4876 4 : if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
4877 2 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4878 : return getPreloadedValue(DAG, *MFI, VT,
4879 2 : AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4880 : }
4881 54 : case Intrinsic::amdgcn_dispatch_ptr:
4882 : case Intrinsic::amdgcn_queue_ptr: {
4883 54 : if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
4884 : DiagnosticInfoUnsupported BadIntrin(
4885 : MF.getFunction(), "unsupported hsa intrinsic without hsa target",
4886 2 : DL.getDebugLoc());
4887 2 : DAG.getContext()->diagnose(BadIntrin);
4888 2 : return DAG.getUNDEF(VT);
4889 : }
4890 :
4891 52 : auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
4892 : AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
4893 52 : return getPreloadedValue(DAG, *MFI, VT, RegID);
4894 : }
4895 40 : case Intrinsic::amdgcn_implicitarg_ptr: {
4896 40 : if (MFI->isEntryFunction())
4897 32 : return getImplicitArgPtr(DAG, DL);
4898 : return getPreloadedValue(DAG, *MFI, VT,
4899 8 : AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4900 : }
4901 13565 : case Intrinsic::amdgcn_kernarg_segment_ptr: {
4902 : return getPreloadedValue(DAG, *MFI, VT,
4903 13565 : AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4904 : }
4905 9 : case Intrinsic::amdgcn_dispatch_id: {
4906 9 : return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
4907 : }
4908 : case Intrinsic::amdgcn_rcp:
4909 29 : return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
4910 : case Intrinsic::amdgcn_rsq:
4911 33 : return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
4912 5 : case Intrinsic::amdgcn_rsq_legacy:
4913 5 : if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4914 1 : return emitRemovedIntrinsicError(DAG, DL, VT);
4915 :
4916 4 : return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
4917 11 : case Intrinsic::amdgcn_rcp_legacy:
4918 11 : if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4919 4 : return emitRemovedIntrinsicError(DAG, DL, VT);
4920 7 : return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
4921 6 : case Intrinsic::amdgcn_rsq_clamp: {
4922 6 : if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4923 3 : return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
4924 :
4925 3 : Type *Type = VT.getTypeForEVT(*DAG.getContext());
4926 3 : APFloat Max = APFloat::getLargest(Type->getFltSemantics());
4927 3 : APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
4928 :
4929 3 : SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
4930 : SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
4931 3 : DAG.getConstantFP(Max, DL, VT));
4932 : return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
4933 3 : DAG.getConstantFP(Min, DL, VT));
4934 : }
4935 2 : case Intrinsic::r600_read_ngroups_x:
4936 4 : if (Subtarget->isAmdHsaOS())
4937 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4938 :
4939 : return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4940 2 : SI::KernelInputOffsets::NGROUPS_X, 4, false);
4941 2 : case Intrinsic::r600_read_ngroups_y:
4942 4 : if (Subtarget->isAmdHsaOS())
4943 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4944 :
4945 : return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4946 2 : SI::KernelInputOffsets::NGROUPS_Y, 4, false);
4947 2 : case Intrinsic::r600_read_ngroups_z:
4948 4 : if (Subtarget->isAmdHsaOS())
4949 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4950 :
4951 : return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4952 2 : SI::KernelInputOffsets::NGROUPS_Z, 4, false);
4953 2 : case Intrinsic::r600_read_global_size_x:
4954 4 : if (Subtarget->isAmdHsaOS())
4955 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4956 :
4957 : return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4958 2 : SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
4959 2 : case Intrinsic::r600_read_global_size_y:
4960 4 : if (Subtarget->isAmdHsaOS())
4961 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4962 :
4963 : return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4964 2 : SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
4965 2 : case Intrinsic::r600_read_global_size_z:
4966 4 : if (Subtarget->isAmdHsaOS())
4967 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4968 :
4969 : return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4970 2 : SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
4971 13 : case Intrinsic::r600_read_local_size_x:
4972 26 : if (Subtarget->isAmdHsaOS())
4973 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4974 :
4975 : return lowerImplicitZextParam(DAG, Op, MVT::i16,
4976 13 : SI::KernelInputOffsets::LOCAL_SIZE_X);
4977 39 : case Intrinsic::r600_read_local_size_y:
4978 78 : if (Subtarget->isAmdHsaOS())
4979 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4980 :
4981 : return lowerImplicitZextParam(DAG, Op, MVT::i16,
4982 39 : SI::KernelInputOffsets::LOCAL_SIZE_Y);
4983 39 : case Intrinsic::r600_read_local_size_z:
4984 78 : if (Subtarget->isAmdHsaOS())
4985 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4986 :
4987 : return lowerImplicitZextParam(DAG, Op, MVT::i16,
4988 39 : SI::KernelInputOffsets::LOCAL_SIZE_Z);
4989 49 : case Intrinsic::amdgcn_workgroup_id_x:
4990 : case Intrinsic::r600_read_tgid_x:
4991 : return getPreloadedValue(DAG, *MFI, VT,
4992 49 : AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4993 24 : case Intrinsic::amdgcn_workgroup_id_y:
4994 : case Intrinsic::r600_read_tgid_y:
4995 : return getPreloadedValue(DAG, *MFI, VT,
4996 24 : AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4997 24 : case Intrinsic::amdgcn_workgroup_id_z:
4998 : case Intrinsic::r600_read_tgid_z:
4999 : return getPreloadedValue(DAG, *MFI, VT,
5000 24 : AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5001 3280 : case Intrinsic::amdgcn_workitem_id_x: {
5002 : case Intrinsic::r600_read_tidig_x:
5003 : return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5004 3280 : SDLoc(DAG.getEntryNode()),
5005 3280 : MFI->getArgInfo().WorkItemIDX);
5006 : }
5007 125 : case Intrinsic::amdgcn_workitem_id_y:
5008 : case Intrinsic::r600_read_tidig_y:
5009 : return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5010 125 : SDLoc(DAG.getEntryNode()),
5011 125 : MFI->getArgInfo().WorkItemIDY);
5012 74 : case Intrinsic::amdgcn_workitem_id_z:
5013 : case Intrinsic::r600_read_tidig_z:
5014 : return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5015 74 : SDLoc(DAG.getEntryNode()),
5016 74 : MFI->getArgInfo().WorkItemIDZ);
5017 : case AMDGPUIntrinsic::SI_load_const: {
5018 : SDValue Load =
5019 : lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
5020 468 : DAG.getTargetConstant(0, DL, MVT::i1), DAG);
5021 468 : return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
5022 : }
5023 60 : case Intrinsic::amdgcn_s_buffer_load: {
5024 60 : unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
5025 : return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
5026 120 : DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
5027 : }
5028 33 : case Intrinsic::amdgcn_fdiv_fast:
5029 33 : return lowerFDIV_FAST(Op, DAG);
5030 84 : case Intrinsic::amdgcn_interp_mov: {
5031 84 : SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5032 84 : SDValue Glue = M0.getValue(1);
5033 : return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
5034 84 : Op.getOperand(2), Op.getOperand(3), Glue);
5035 : }
5036 215 : case Intrinsic::amdgcn_interp_p1: {
5037 215 : SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5038 215 : SDValue Glue = M0.getValue(1);
5039 : return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
5040 215 : Op.getOperand(2), Op.getOperand(3), Glue);
5041 : }
5042 199 : case Intrinsic::amdgcn_interp_p2: {
5043 199 : SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5044 199 : SDValue Glue = SDValue(M0.getNode(), 1);
5045 : return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
5046 : Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
5047 199 : Glue);
5048 : }
5049 : case Intrinsic::amdgcn_sin:
5050 7 : return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
5051 :
5052 : case Intrinsic::amdgcn_cos:
5053 3 : return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
5054 :
5055 3 : case Intrinsic::amdgcn_log_clamp: {
5056 3 : if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5057 2 : return SDValue();
5058 :
5059 : DiagnosticInfoUnsupported BadIntrin(
5060 : MF.getFunction(), "intrinsic not supported on subtarget",
5061 1 : DL.getDebugLoc());
5062 1 : DAG.getContext()->diagnose(BadIntrin);
5063 1 : return DAG.getUNDEF(VT);
5064 : }
5065 : case Intrinsic::amdgcn_ldexp:
5066 : return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
5067 10 : Op.getOperand(1), Op.getOperand(2));
5068 :
5069 : case Intrinsic::amdgcn_fract:
5070 8 : return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
5071 :
5072 : case Intrinsic::amdgcn_class:
5073 : return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
5074 64 : Op.getOperand(1), Op.getOperand(2));
5075 10 : case Intrinsic::amdgcn_div_fmas:
5076 : return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
5077 : Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5078 10 : Op.getOperand(4));
5079 :
5080 : case Intrinsic::amdgcn_div_fixup:
5081 : return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
5082 13 : Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5083 :
5084 : case Intrinsic::amdgcn_trig_preop:
5085 : return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
5086 4 : Op.getOperand(1), Op.getOperand(2));
5087 27 : case Intrinsic::amdgcn_div_scale: {
5088 : // 3rd parameter required to be a constant.
5089 : const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
5090 : if (!Param)
5091 3 : return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
5092 :
5093 : // Translate to the operands expected by the machine instruction. The first
5094 : // source operand must match the input selected by the constant third operand.
5095 24 : SDValue Numerator = Op.getOperand(1);
5096 24 : SDValue Denominator = Op.getOperand(2);
5097 :
5098 : // Note this order is opposite of the machine instruction's operations,
5099 : // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
5100 : // intrinsic has the numerator as the first operand to match a normal
5101 : // division operation.
5102 :
5103 48 : SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
5104 :
5105 : return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
5106 48 : Denominator, Numerator);
5107 : }
5108 72 : case Intrinsic::amdgcn_icmp: {
5109 72 : return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
5110 : }
5111 86 : case Intrinsic::amdgcn_fcmp: {
5112 86 : return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
5113 : }
5114 : case Intrinsic::amdgcn_fmed3:
5115 : return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
5116 84 : Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5117 2 : case Intrinsic::amdgcn_fdot2:
5118 : return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
5119 : Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5120 2 : Op.getOperand(4));
5121 : case Intrinsic::amdgcn_fmul_legacy:
5122 : return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
5123 57 : Op.getOperand(1), Op.getOperand(2));
5124 : case Intrinsic::amdgcn_sffbh:
5125 4 : return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
5126 : case Intrinsic::amdgcn_sbfe:
5127 : return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
5128 102 : Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5129 : case Intrinsic::amdgcn_ubfe:
5130 : return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
5131 94 : Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5132 103 : case Intrinsic::amdgcn_cvt_pkrtz:
5133 : case Intrinsic::amdgcn_cvt_pknorm_i16:
5134 : case Intrinsic::amdgcn_cvt_pknorm_u16:
5135 : case Intrinsic::amdgcn_cvt_pk_i16:
5136 : case Intrinsic::amdgcn_cvt_pk_u16: {
5137 : // FIXME: Stop adding cast if v2f16/v2i16 are legal.
5138 103 : EVT VT = Op.getValueType();
5139 : unsigned Opcode;
5140 :
5141 103 : if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
5142 : Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
5143 56 : else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
5144 : Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5145 38 : else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
5146 : Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5147 20 : else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
5148 : Opcode = AMDGPUISD::CVT_PK_I16_I32;
5149 : else
5150 : Opcode = AMDGPUISD::CVT_PK_U16_U32;
5151 :
5152 : if (isTypeLegal(VT))
5153 103 : return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
5154 :
5155 : SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
5156 0 : Op.getOperand(1), Op.getOperand(2));
5157 0 : return DAG.getNode(ISD::BITCAST, DL, VT, Node);
5158 : }
5159 13 : case Intrinsic::amdgcn_wqm: {
5160 13 : SDValue Src = Op.getOperand(1);
5161 13 : return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
5162 13 : 0);
5163 : }
5164 278 : case Intrinsic::amdgcn_wwm: {
5165 278 : SDValue Src = Op.getOperand(1);
5166 278 : return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
5167 278 : 0);
5168 : }
5169 : case Intrinsic::amdgcn_fmad_ftz:
5170 : return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
5171 111 : Op.getOperand(2), Op.getOperand(3));
5172 2241 : default:
5173 2241 : if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5174 2241 : AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5175 2241 : return lowerImage(Op, ImageDimIntr, DAG);
5176 :
5177 2205 : return Op;
5178 : }
5179 : }
5180 :
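// One case from the switch above that is easy to model numerically is
// amdgcn_rsq_clamp on subtargets without the native instruction: rsq(x) clamped
// between the largest finite float and its negation. A minimal illustrative
// sketch, ignoring the NaN semantics of FMINNUM/FMAXNUM and approximating
// AMDGPUISD::RSQ as 1/sqrt:
#include <algorithm>
#include <cmath>
#include <limits>
static float rsqClampF32(float X) {
  float Max = std::numeric_limits<float>::max();
  float Rsq = 1.0f / std::sqrt(X);           // stand-in for AMDGPUISD::RSQ
  return std::max(std::min(Rsq, Max), -Max); // FMINNUM with +Max, FMAXNUM with -Max
}
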
5181 1617 : SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5182 : SelectionDAG &DAG) const {
5183 3234 : unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5184 : SDLoc DL(Op);
5185 :
5186 1617 : switch (IntrID) {
5187 : case Intrinsic::amdgcn_atomic_inc:
5188 : case Intrinsic::amdgcn_atomic_dec:
5189 : case Intrinsic::amdgcn_ds_fadd:
5190 : case Intrinsic::amdgcn_ds_fmin:
5191 : case Intrinsic::amdgcn_ds_fmax: {
5192 : MemSDNode *M = cast<MemSDNode>(Op);
5193 : unsigned Opc;
5194 : switch (IntrID) {
5195 : case Intrinsic::amdgcn_atomic_inc:
5196 : Opc = AMDGPUISD::ATOMIC_INC;
5197 : break;
5198 115 : case Intrinsic::amdgcn_atomic_dec:
5199 : Opc = AMDGPUISD::ATOMIC_DEC;
5200 115 : break;
5201 6 : case Intrinsic::amdgcn_ds_fadd:
5202 : Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
5203 6 : break;
5204 6 : case Intrinsic::amdgcn_ds_fmin:
5205 : Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
5206 6 : break;
5207 6 : case Intrinsic::amdgcn_ds_fmax:
5208 : Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
5209 6 : break;
5210 0 : default:
5211 0 : llvm_unreachable("Unknown intrinsic!");
5212 : }
5213 : SDValue Ops[] = {
5214 245 : M->getOperand(0), // Chain
5215 : M->getOperand(2), // Ptr
5216 : M->getOperand(3) // Value
5217 245 : };
5218 :
5219 245 : return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
5220 735 : M->getMemoryVT(), M->getMemOperand());
5221 : }
5222 202 : case Intrinsic::amdgcn_buffer_load:
5223 : case Intrinsic::amdgcn_buffer_load_format: {
5224 404 : unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
5225 404 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5226 : unsigned IdxEn = 1;
5227 : if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5228 226 : IdxEn = Idx->getZExtValue() != 0;
5229 : SDValue Ops[] = {
5230 : Op.getOperand(0), // Chain
5231 : Op.getOperand(2), // rsrc
5232 : Op.getOperand(3), // vindex
5233 : SDValue(), // voffset -- will be set by setBufferOffsets
5234 : SDValue(), // soffset -- will be set by setBufferOffsets
5235 : SDValue(), // offset -- will be set by setBufferOffsets
5236 202 : DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5237 202 : DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5238 202 : };
5239 :
5240 202 : setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
5241 202 : unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
5242 : AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5243 :
5244 202 : EVT VT = Op.getValueType();
5245 202 : EVT IntVT = VT.changeTypeToInteger();
5246 : auto *M = cast<MemSDNode>(Op);
5247 202 : EVT LoadVT = Op.getValueType();
5248 :
5249 211 : if (LoadVT.getScalarType() == MVT::f16)
5250 : return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5251 9 : M, DAG, Ops);
5252 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5253 579 : M->getMemOperand());
5254 : }
5255 107 : case Intrinsic::amdgcn_raw_buffer_load:
5256 : case Intrinsic::amdgcn_raw_buffer_load_format: {
5257 107 : auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5258 : SDValue Ops[] = {
5259 : Op.getOperand(0), // Chain
5260 : Op.getOperand(2), // rsrc
5261 107 : DAG.getConstant(0, DL, MVT::i32), // vindex
5262 : Offsets.first, // voffset
5263 : Op.getOperand(4), // soffset
5264 : Offsets.second, // offset
5265 : Op.getOperand(5), // cachepolicy
5266 107 : DAG.getConstant(0, DL, MVT::i1), // idxen
5267 107 : };
5268 :
5269 107 : unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
5270 : AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5271 :
5272 107 : EVT VT = Op.getValueType();
5273 107 : EVT IntVT = VT.changeTypeToInteger();
5274 : auto *M = cast<MemSDNode>(Op);
5275 107 : EVT LoadVT = Op.getValueType();
5276 :
5277 116 : if (LoadVT.getScalarType() == MVT::f16)
5278 : return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5279 9 : M, DAG, Ops);
5280 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5281 294 : M->getMemOperand());
5282 : }
5283 75 : case Intrinsic::amdgcn_struct_buffer_load:
5284 : case Intrinsic::amdgcn_struct_buffer_load_format: {
5285 75 : auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5286 : SDValue Ops[] = {
5287 : Op.getOperand(0), // Chain
5288 : Op.getOperand(2), // rsrc
5289 : Op.getOperand(3), // vindex
5290 : Offsets.first, // voffset
5291 : Op.getOperand(5), // soffset
5292 : Offsets.second, // offset
5293 : Op.getOperand(6), // cachepolicy
5294 75 : DAG.getConstant(1, DL, MVT::i1), // idxen
5295 75 : };
5296 :
5297 75 : unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
5298 : AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5299 :
5300 75 : EVT VT = Op.getValueType();
5301 75 : EVT IntVT = VT.changeTypeToInteger();
5302 : auto *M = cast<MemSDNode>(Op);
5303 75 : EVT LoadVT = Op.getValueType();
5304 :
5305 84 : if (LoadVT.getScalarType() == MVT::f16)
5306 : return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5307 9 : M, DAG, Ops);
5308 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5309 198 : M->getMemOperand());
5310 : }
5311 : case Intrinsic::amdgcn_tbuffer_load: {
5312 : MemSDNode *M = cast<MemSDNode>(Op);
5313 37 : EVT LoadVT = Op.getValueType();
5314 :
5315 74 : unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5316 74 : unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5317 74 : unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5318 74 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5319 : unsigned IdxEn = 1;
5320 : if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5321 66 : IdxEn = Idx->getZExtValue() != 0;
5322 : SDValue Ops[] = {
5323 : Op.getOperand(0), // Chain
5324 : Op.getOperand(2), // rsrc
5325 : Op.getOperand(3), // vindex
5326 : Op.getOperand(4), // voffset
5327 : Op.getOperand(5), // soffset
5328 : Op.getOperand(6), // offset
5329 37 : DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5330 37 : DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5331 37 : DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5332 37 : };
5333 :
5334 46 : if (LoadVT.getScalarType() == MVT::f16)
5335 : return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5336 9 : M, DAG, Ops);
5337 : return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5338 : Op->getVTList(), Ops, LoadVT,
5339 84 : M->getMemOperand());
5340 : }
5341 : case Intrinsic::amdgcn_raw_tbuffer_load: {
5342 : MemSDNode *M = cast<MemSDNode>(Op);
5343 33 : EVT LoadVT = Op.getValueType();
5344 33 : auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5345 :
5346 : SDValue Ops[] = {
5347 : Op.getOperand(0), // Chain
5348 : Op.getOperand(2), // rsrc
5349 33 : DAG.getConstant(0, DL, MVT::i32), // vindex
5350 : Offsets.first, // voffset
5351 : Op.getOperand(4), // soffset
5352 : Offsets.second, // offset
5353 : Op.getOperand(5), // format
5354 : Op.getOperand(6), // cachepolicy
5355 33 : DAG.getConstant(0, DL, MVT::i1), // idxen
5356 33 : };
5357 :
5358 42 : if (LoadVT.getScalarType() == MVT::f16)
5359 : return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5360 9 : M, DAG, Ops);
5361 : return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5362 : Op->getVTList(), Ops, LoadVT,
5363 72 : M->getMemOperand());
5364 : }
5365 : case Intrinsic::amdgcn_struct_tbuffer_load: {
5366 : MemSDNode *M = cast<MemSDNode>(Op);
5367 37 : EVT LoadVT = Op.getValueType();
5368 37 : auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5369 :
5370 : SDValue Ops[] = {
5371 : Op.getOperand(0), // Chain
5372 : Op.getOperand(2), // rsrc
5373 : Op.getOperand(3), // vindex
5374 : Offsets.first, // voffset
5375 : Op.getOperand(5), // soffset
5376 : Offsets.second, // offset
5377 : Op.getOperand(6), // format
5378 : Op.getOperand(7), // cachepolicy
5379 37 : DAG.getConstant(1, DL, MVT::i1), // idxen
5380 37 : };
5381 :
5382 46 : if (LoadVT.getScalarType() == MVT::f16)
5383 : return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5384 9 : M, DAG, Ops);
5385 : return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5386 : Op->getVTList(), Ops, LoadVT,
5387 84 : M->getMemOperand());
5388 : }
5389 58 : case Intrinsic::amdgcn_buffer_atomic_swap:
5390 : case Intrinsic::amdgcn_buffer_atomic_add:
5391 : case Intrinsic::amdgcn_buffer_atomic_sub:
5392 : case Intrinsic::amdgcn_buffer_atomic_smin:
5393 : case Intrinsic::amdgcn_buffer_atomic_umin:
5394 : case Intrinsic::amdgcn_buffer_atomic_smax:
5395 : case Intrinsic::amdgcn_buffer_atomic_umax:
5396 : case Intrinsic::amdgcn_buffer_atomic_and:
5397 : case Intrinsic::amdgcn_buffer_atomic_or:
5398 : case Intrinsic::amdgcn_buffer_atomic_xor: {
5399 116 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5400 : unsigned IdxEn = 1;
5401 : if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5402 60 : IdxEn = Idx->getZExtValue() != 0;
5403 : SDValue Ops[] = {
5404 : Op.getOperand(0), // Chain
5405 : Op.getOperand(2), // vdata
5406 : Op.getOperand(3), // rsrc
5407 : Op.getOperand(4), // vindex
5408 : SDValue(), // voffset -- will be set by setBufferOffsets
5409 : SDValue(), // soffset -- will be set by setBufferOffsets
5410 : SDValue(), // offset -- will be set by setBufferOffsets
5411 58 : DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5412 58 : DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5413 58 : };
5414 58 : setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
5415 58 : EVT VT = Op.getValueType();
5416 :
5417 : auto *M = cast<MemSDNode>(Op);
5418 : unsigned Opcode = 0;
5419 :
5420 : switch (IntrID) {
5421 : case Intrinsic::amdgcn_buffer_atomic_swap:
5422 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5423 : break;
5424 16 : case Intrinsic::amdgcn_buffer_atomic_add:
5425 : Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5426 16 : break;
5427 14 : case Intrinsic::amdgcn_buffer_atomic_sub:
5428 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5429 14 : break;
5430 2 : case Intrinsic::amdgcn_buffer_atomic_smin:
5431 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5432 2 : break;
5433 2 : case Intrinsic::amdgcn_buffer_atomic_umin:
5434 : Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5435 2 : break;
5436 2 : case Intrinsic::amdgcn_buffer_atomic_smax:
5437 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5438 2 : break;
5439 2 : case Intrinsic::amdgcn_buffer_atomic_umax:
5440 : Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5441 2 : break;
5442 2 : case Intrinsic::amdgcn_buffer_atomic_and:
5443 : Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5444 2 : break;
5445 2 : case Intrinsic::amdgcn_buffer_atomic_or:
5446 : Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5447 2 : break;
5448 2 : case Intrinsic::amdgcn_buffer_atomic_xor:
5449 : Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5450 2 : break;
5451 0 : default:
5452 0 : llvm_unreachable("unhandled atomic opcode");
5453 : }
5454 :
5455 : return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5456 174 : M->getMemOperand());
5457 : }
5458 54 : case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5459 : case Intrinsic::amdgcn_raw_buffer_atomic_add:
5460 : case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5461 : case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5462 : case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5463 : case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5464 : case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5465 : case Intrinsic::amdgcn_raw_buffer_atomic_and:
5466 : case Intrinsic::amdgcn_raw_buffer_atomic_or:
5467 : case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
5468 54 : auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5469 : SDValue Ops[] = {
5470 : Op.getOperand(0), // Chain
5471 : Op.getOperand(2), // vdata
5472 : Op.getOperand(3), // rsrc
5473 54 : DAG.getConstant(0, DL, MVT::i32), // vindex
5474 : Offsets.first, // voffset
5475 : Op.getOperand(5), // soffset
5476 : Offsets.second, // offset
5477 : Op.getOperand(6), // cachepolicy
5478 54 : DAG.getConstant(0, DL, MVT::i1), // idxen
5479 54 : };
5480 54 : EVT VT = Op.getValueType();
5481 :
5482 : auto *M = cast<MemSDNode>(Op);
5483 : unsigned Opcode = 0;
5484 :
5485 : switch (IntrID) {
5486 : case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5487 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5488 : break;
5489 16 : case Intrinsic::amdgcn_raw_buffer_atomic_add:
5490 : Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5491 16 : break;
5492 14 : case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5493 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5494 14 : break;
5495 2 : case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5496 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5497 2 : break;
5498 2 : case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5499 : Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5500 2 : break;
5501 2 : case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5502 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5503 2 : break;
5504 2 : case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5505 : Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5506 2 : break;
5507 2 : case Intrinsic::amdgcn_raw_buffer_atomic_and:
5508 : Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5509 2 : break;
5510 2 : case Intrinsic::amdgcn_raw_buffer_atomic_or:
5511 : Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5512 2 : break;
5513 2 : case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5514 : Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5515 2 : break;
5516 0 : default:
5517 0 : llvm_unreachable("unhandled atomic opcode");
5518 : }
5519 :
5520 : return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5521 162 : M->getMemOperand());
5522 : }
5523 64 : case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5524 : case Intrinsic::amdgcn_struct_buffer_atomic_add:
5525 : case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5526 : case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5527 : case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5528 : case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5529 : case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5530 : case Intrinsic::amdgcn_struct_buffer_atomic_and:
5531 : case Intrinsic::amdgcn_struct_buffer_atomic_or:
5532 : case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
5533 64 : auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5534 : SDValue Ops[] = {
5535 : Op.getOperand(0), // Chain
5536 : Op.getOperand(2), // vdata
5537 : Op.getOperand(3), // rsrc
5538 : Op.getOperand(4), // vindex
5539 : Offsets.first, // voffset
5540 : Op.getOperand(6), // soffset
5541 : Offsets.second, // offset
5542 : Op.getOperand(7), // cachepolicy
5543 64 : DAG.getConstant(1, DL, MVT::i1), // idxen
5544 64 : };
5545 64 : EVT VT = Op.getValueType();
5546 :
5547 : auto *M = cast<MemSDNode>(Op);
5548 : unsigned Opcode = 0;
5549 :
5550 : switch (IntrID) {
5551 : case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5552 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5553 : break;
5554 19 : case Intrinsic::amdgcn_struct_buffer_atomic_add:
5555 : Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5556 19 : break;
5557 17 : case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5558 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5559 17 : break;
5560 2 : case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5561 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5562 2 : break;
5563 2 : case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5564 : Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5565 2 : break;
5566 2 : case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5567 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5568 2 : break;
5569 2 : case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5570 : Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5571 2 : break;
5572 2 : case Intrinsic::amdgcn_struct_buffer_atomic_and:
5573 : Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5574 2 : break;
5575 2 : case Intrinsic::amdgcn_struct_buffer_atomic_or:
5576 : Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5577 2 : break;
5578 2 : case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5579 : Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5580 2 : break;
5581 0 : default:
5582 0 : llvm_unreachable("unhandled atomic opcode");
5583 : }
5584 :
5585 : return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5586 192 : M->getMemOperand());
5587 : }
5588 12 : case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
5589 24 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5590 : unsigned IdxEn = 1;
5591 : if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
5592 16 : IdxEn = Idx->getZExtValue() != 0;
5593 : SDValue Ops[] = {
5594 : Op.getOperand(0), // Chain
5595 : Op.getOperand(2), // src
5596 : Op.getOperand(3), // cmp
5597 : Op.getOperand(4), // rsrc
5598 : Op.getOperand(5), // vindex
5599 : SDValue(), // voffset -- will be set by setBufferOffsets
5600 : SDValue(), // soffset -- will be set by setBufferOffsets
5601 : SDValue(), // offset -- will be set by setBufferOffsets
5602 12 : DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5603 12 : DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5604 12 : };
5605 12 : setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
5606 12 : EVT VT = Op.getValueType();
5607 : auto *M = cast<MemSDNode>(Op);
5608 :
5609 : return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5610 36 : Op->getVTList(), Ops, VT, M->getMemOperand());
5611 : }
5612 8 : case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
5613 8 : auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5614 : SDValue Ops[] = {
5615 : Op.getOperand(0), // Chain
5616 : Op.getOperand(2), // src
5617 : Op.getOperand(3), // cmp
5618 : Op.getOperand(4), // rsrc
5619 8 : DAG.getConstant(0, DL, MVT::i32), // vindex
5620 : Offsets.first, // voffset
5621 : Op.getOperand(6), // soffset
5622 : Offsets.second, // offset
5623 : Op.getOperand(7), // cachepolicy
5624 8 : DAG.getConstant(0, DL, MVT::i1), // idxen
5625 8 : };
5626 8 : EVT VT = Op.getValueType();
5627 : auto *M = cast<MemSDNode>(Op);
5628 :
5629 : return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5630 24 : Op->getVTList(), Ops, VT, M->getMemOperand());
5631 : }
5632 12 : case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
5633 12 : auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
5634 : SDValue Ops[] = {
5635 : Op.getOperand(0), // Chain
5636 : Op.getOperand(2), // src
5637 : Op.getOperand(3), // cmp
5638 : Op.getOperand(4), // rsrc
5639 : Op.getOperand(5), // vindex
5640 : Offsets.first, // voffset
5641 : Op.getOperand(7), // soffset
5642 : Offsets.second, // offset
5643 : Op.getOperand(8), // cachepolicy
5644 12 : DAG.getConstant(1, DL, MVT::i1), // idxen
5645 12 : };
5646 12 : EVT VT = Op.getValueType();
5647 : auto *M = cast<MemSDNode>(Op);
5648 :
5649 : return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5650 36 : Op->getVTList(), Ops, VT, M->getMemOperand());
5651 : }
5652 :
5653 673 : default:
5654 673 : if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5655 673 : AMDGPU::getImageDimIntrinsicInfo(IntrID))
5656 673 : return lowerImage(Op, ImageDimIntr, DAG);
5657 :
5658 52 : return SDValue();
5659 : }
5660 : }
5661 :
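 : // A brief sketch of what handleD16VData does: on subtargets with unpacked
 : // D16 VMEM, each 16-bit element of the store data occupies its own 32-bit
 : // dword, so the vector is bitcast to integers and every element is
 : // zero-extended to i32 (e.g. <2 x half> becomes two i32 dwords); on packed
 : // subtargets the data is passed through unchanged.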
5662 66 : SDValue SITargetLowering::handleD16VData(SDValue VData,
5663 : SelectionDAG &DAG) const {
5664 66 : EVT StoreVT = VData.getValueType();
5665 :
5666 : // No change for f16 and legal vector D16 types.
5667 66 : if (!StoreVT.isVector())
5668 21 : return VData;
5669 :
5670 : SDLoc DL(VData);
5671 : assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
5672 :
5673 45 : if (Subtarget->hasUnpackedD16VMem()) {
5674 : // We need to unpack the packed data to store.
5675 15 : EVT IntStoreVT = StoreVT.changeTypeToInteger();
5676 15 : SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
5677 :
5678 15 : EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5679 15 : StoreVT.getVectorNumElements());
5680 15 : SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
5681 15 : return DAG.UnrollVectorOp(ZExt.getNode());
5682 : }
5683 :
5684 : assert(isTypeLegal(StoreVT));
5685 30 : return VData;
5686 : }
5687 :
5688 2731 : SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5689 : SelectionDAG &DAG) const {
5690 : SDLoc DL(Op);
5691 2731 : SDValue Chain = Op.getOperand(0);
5692 2731 : unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5693 2731 : MachineFunction &MF = DAG.getMachineFunction();
5694 :
5695 2731 : switch (IntrinsicID) {
5696 403 : case Intrinsic::amdgcn_exp: {
5697 : const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
5698 : const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
5699 : const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
5700 : const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
5701 :
5702 : const SDValue Ops[] = {
5703 : Chain,
5704 806 : DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
5705 806 : DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
5706 : Op.getOperand(4), // src0
5707 : Op.getOperand(5), // src1
5708 : Op.getOperand(6), // src2
5709 : Op.getOperand(7), // src3
5710 403 : DAG.getTargetConstant(0, DL, MVT::i1), // compr
5711 403 : DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
5712 806 : };
5713 :
5714 806 : unsigned Opc = Done->isNullValue() ?
5715 : AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
5716 806 : return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
5717 : }
5718 96 : case Intrinsic::amdgcn_exp_compr: {
5719 : const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
5720 : const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
5721 96 : SDValue Src0 = Op.getOperand(4);
5722 96 : SDValue Src1 = Op.getOperand(5);
5723 : const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
5724 : const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
5725 :
5726 96 : SDValue Undef = DAG.getUNDEF(MVT::f32);
5727 : const SDValue Ops[] = {
5728 : Chain,
5729 192 : DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
5730 192 : DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
5731 96 : DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
5732 96 : DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
5733 : Undef, // src2
5734 : Undef, // src3
5735 96 : DAG.getTargetConstant(1, DL, MVT::i1), // compr
5736 96 : DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
5737 192 : };
5738 :
5739 192 : unsigned Opc = Done->isNullValue() ?
5740 : AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
5741 192 : return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
5742 : }
5743 28 : case Intrinsic::amdgcn_s_sendmsg:
5744 : case Intrinsic::amdgcn_s_sendmsghalt: {
5745 28 : unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
5746 : AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
5747 28 : Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
5748 28 : SDValue Glue = Chain.getValue(1);
5749 : return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
5750 28 : Op.getOperand(2), Glue);
5751 : }
5752 : case Intrinsic::amdgcn_init_exec: {
5753 : return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
5754 3 : Op.getOperand(2));
5755 : }
5756 : case Intrinsic::amdgcn_init_exec_from_input: {
5757 : return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
5758 4 : Op.getOperand(2), Op.getOperand(3));
5759 : }
5760 31 : case AMDGPUIntrinsic::AMDGPU_kill: {
5761 31 : SDValue Src = Op.getOperand(2);
5762 : if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
5763 22 : if (!K->isNegative())
5764 4 : return Chain;
5765 :
5766 7 : SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
5767 7 : return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
5768 : }
5769 :
5770 20 : SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
5771 20 : return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
5772 : }
5773 145 : case Intrinsic::amdgcn_s_barrier: {
5774 145 : if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
5775 135 : const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5776 135 : unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
5777 135 : if (WGSize <= ST.getWavefrontSize())
5778 5 : return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
5779 5 : Op.getOperand(0)), 0);
5780 : }
5781 140 : return SDValue();
5782 : };
5783 14 : case AMDGPUIntrinsic::SI_tbuffer_store: {
5784 :
5785 : // Extract vindex and voffset from vaddr as appropriate
5786 : const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
5787 : const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
5788 14 : SDValue VAddr = Op.getOperand(5);
5789 :
5790 14 : SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
5791 :
5792 : assert(!(OffEn->isOne() && IdxEn->isOne()) &&
5793 : "Legacy intrinsic doesn't support both offset and index - use new version");
5794 :
5795 28 : SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
5796 32 : SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
5797 :
5798 : // Deal with the vec-3 case
5799 : const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
5800 28 : auto Opcode = NumChannels->getZExtValue() == 3 ?
5801 : AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
5802 :
5803 28 : unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5804 28 : unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5805 28 : unsigned Glc = cast<ConstantSDNode>(Op.getOperand(12))->getZExtValue();
5806 14 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(13))->getZExtValue();
5807 : SDValue Ops[] = {
5808 : Chain,
5809 : Op.getOperand(3), // vdata
5810 : Op.getOperand(2), // rsrc
5811 : VIndex,
5812 : VOffset,
5813 : Op.getOperand(6), // soffset
5814 : Op.getOperand(7), // inst_offset
5815 14 : DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5816 14 : DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5817 28 : DAG.getConstant(IdxEn->isOne(), DL, MVT::i1), // idxen
5818 14 : };
5819 :
5820 : assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
5821 : "Value of tfe other than zero is unsupported");
5822 :
5823 28 : EVT VT = Op.getOperand(3).getValueType();
5824 28 : MachineMemOperand *MMO = MF.getMachineMemOperand(
5825 : MachinePointerInfo(),
5826 : MachineMemOperand::MOStore,
5827 : VT.getStoreSize(), 4);
5828 : return DAG.getMemIntrinsicNode(Opcode, DL,
5829 28 : Op->getVTList(), Ops, VT, MMO);
5830 : }
5831 :
5832 41 : case Intrinsic::amdgcn_tbuffer_store: {
5833 41 : SDValue VData = Op.getOperand(2);
5834 41 : bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5835 41 : if (IsD16)
5836 9 : VData = handleD16VData(VData, DAG);
5837 82 : unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5838 82 : unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5839 82 : unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5840 82 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
5841 : unsigned IdxEn = 1;
5842 : if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5843 36 : IdxEn = Idx->getZExtValue() != 0;
5844 : SDValue Ops[] = {
5845 : Chain,
5846 : VData, // vdata
5847 : Op.getOperand(3), // rsrc
5848 : Op.getOperand(4), // vindex
5849 : Op.getOperand(5), // voffset
5850 : Op.getOperand(6), // soffset
5851 : Op.getOperand(7), // offset
5852 41 : DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5853 41 : DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5854            41 :       DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5855 41 : };
5856 41 : unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5857 : AMDGPUISD::TBUFFER_STORE_FORMAT;
5858 : MemSDNode *M = cast<MemSDNode>(Op);
5859 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5860 123 : M->getMemoryVT(), M->getMemOperand());
5861 : }
5862 :
5863 35 : case Intrinsic::amdgcn_struct_tbuffer_store: {
5864 35 : SDValue VData = Op.getOperand(2);
5865 35 : bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5866 35 : if (IsD16)
5867 9 : VData = handleD16VData(VData, DAG);
5868 35 : auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5869 : SDValue Ops[] = {
5870 : Chain,
5871 : VData, // vdata
5872 : Op.getOperand(3), // rsrc
5873 : Op.getOperand(4), // vindex
5874 : Offsets.first, // voffset
5875 : Op.getOperand(6), // soffset
5876 : Offsets.second, // offset
5877 : Op.getOperand(7), // format
5878 : Op.getOperand(8), // cachepolicy
5879            35 :       DAG.getConstant(1, DL, MVT::i1), // idxen
5880 70 : };
5881 35 : unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5882 : AMDGPUISD::TBUFFER_STORE_FORMAT;
5883 : MemSDNode *M = cast<MemSDNode>(Op);
5884 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5885 105 : M->getMemoryVT(), M->getMemOperand());
5886 : }
5887 :
5888 27 : case Intrinsic::amdgcn_raw_tbuffer_store: {
5889 27 : SDValue VData = Op.getOperand(2);
5890 27 : bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5891 27 : if (IsD16)
5892 9 : VData = handleD16VData(VData, DAG);
5893 27 : auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5894 : SDValue Ops[] = {
5895 : Chain,
5896 : VData, // vdata
5897 : Op.getOperand(3), // rsrc
5898 27 : DAG.getConstant(0, DL, MVT::i32), // vindex
5899 : Offsets.first, // voffset
5900 : Op.getOperand(5), // soffset
5901 : Offsets.second, // offset
5902 : Op.getOperand(6), // format
5903 : Op.getOperand(7), // cachepolicy
5904            27 :       DAG.getConstant(0, DL, MVT::i1), // idxen
5905 54 : };
5906 27 : unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5907 : AMDGPUISD::TBUFFER_STORE_FORMAT;
5908 : MemSDNode *M = cast<MemSDNode>(Op);
5909 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5910 81 : M->getMemoryVT(), M->getMemOperand());
5911 : }
5912 :
5913 153 : case Intrinsic::amdgcn_buffer_store:
5914 : case Intrinsic::amdgcn_buffer_store_format: {
5915 153 : SDValue VData = Op.getOperand(2);
5916 153 : bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5917 153 : if (IsD16)
5918 9 : VData = handleD16VData(VData, DAG);
5919 306 : unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5920 306 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5921 : unsigned IdxEn = 1;
5922 : if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5923 180 : IdxEn = Idx->getZExtValue() != 0;
5924 : SDValue Ops[] = {
5925 : Chain,
5926 : VData,
5927 : Op.getOperand(3), // rsrc
5928 : Op.getOperand(4), // vindex
5929 : SDValue(), // voffset -- will be set by setBufferOffsets
5930 : SDValue(), // soffset -- will be set by setBufferOffsets
5931 : SDValue(), // offset -- will be set by setBufferOffsets
5932 153 : DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5933 153 : DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5934 153 : };
5935 153 : setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
5936 153 : unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
5937 : AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
5938 153 : Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
5939 : MemSDNode *M = cast<MemSDNode>(Op);
5940 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5941 459 : M->getMemoryVT(), M->getMemOperand());
5942 : }
5943 :
5944 95 : case Intrinsic::amdgcn_raw_buffer_store:
5945 : case Intrinsic::amdgcn_raw_buffer_store_format: {
5946 95 : SDValue VData = Op.getOperand(2);
5947 95 : bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5948 95 : if (IsD16)
5949 9 : VData = handleD16VData(VData, DAG);
5950 95 : auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5951 : SDValue Ops[] = {
5952 : Chain,
5953 : VData,
5954 : Op.getOperand(3), // rsrc
5955 95 : DAG.getConstant(0, DL, MVT::i32), // vindex
5956 : Offsets.first, // voffset
5957 : Op.getOperand(5), // soffset
5958 : Offsets.second, // offset
5959 : Op.getOperand(6), // cachepolicy
5960 95 : DAG.getConstant(0, DL, MVT::i1), // idxen
5961 190 : };
5962 95 : unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
5963 : AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
5964 95 : Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
5965 : MemSDNode *M = cast<MemSDNode>(Op);
5966 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5967 285 : M->getMemoryVT(), M->getMemOperand());
5968 : }
5969 :
5970 63 : case Intrinsic::amdgcn_struct_buffer_store:
5971 : case Intrinsic::amdgcn_struct_buffer_store_format: {
5972 63 : SDValue VData = Op.getOperand(2);
5973 63 : bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5974 63 : if (IsD16)
5975 9 : VData = handleD16VData(VData, DAG);
5976 63 : auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5977 : SDValue Ops[] = {
5978 : Chain,
5979 : VData,
5980 : Op.getOperand(3), // rsrc
5981 : Op.getOperand(4), // vindex
5982 : Offsets.first, // voffset
5983 : Op.getOperand(6), // soffset
5984 : Offsets.second, // offset
5985 : Op.getOperand(7), // cachepolicy
5986 63 : DAG.getConstant(1, DL, MVT::i1), // idxen
5987 126 : };
5988 63 : unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
5989 : AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
5990 63 : Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
5991 : MemSDNode *M = cast<MemSDNode>(Op);
5992 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5993 189 : M->getMemoryVT(), M->getMemOperand());
5994 : }
5995 :
5996 1593 : default: {
5997 1593 : if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5998 1593 : AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5999 87 : return lowerImage(Op, ImageDimIntr, DAG);
6000 :
6001 1506 : return Op;
6002 : }
6003 : }
6004 : }
6005 :
6006 : // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6007 : // offset (the offset that is included in bounds checking and swizzling, to be
6008 : // split between the instruction's voffset and immoffset fields) and soffset
6009 : // (the offset that is excluded from bounds checking and swizzling, to go in
6010 : // the instruction's soffset field). This function takes the first kind of
6011 : // offset and figures out how to split it between voffset and immoffset.
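 : // For example (rough arithmetic; the immoffset field holds 0..4095): a
 : // combined offset of (add x, 4100) splits into voffset = (add x, 4096) and
 : // immoffset = 4, while a plain constant 100 becomes voffset = 0 and
 : // immoffset = 100.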
6012 610 : std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
6013 : SDValue Offset, SelectionDAG &DAG) const {
6014 : SDLoc DL(Offset);
6015 : const unsigned MaxImm = 4095;
6016 610 : SDValue N0 = Offset;
6017 : ConstantSDNode *C1 = nullptr;
6018 610 : if (N0.getOpcode() == ISD::ADD) {
6019 : if ((C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))))
6020 80 : N0 = N0.getOperand(0);
6021 : } else if ((C1 = dyn_cast<ConstantSDNode>(N0)))
6022 407 : N0 = SDValue();
6023 :
6024 610 : if (C1) {
6025 487 : unsigned ImmOffset = C1->getZExtValue();
6026             :     // If the immediate value is too big for the immoffset field, keep only the
6027             :     // low 12 bits (value & 4095) in the immoffset field; the remaining multiple
6028             :     // of 4096 is copied/added into the voffset field, where it stands more
6029             :     // chance of being CSEd with the copy/add for another similar load/store.
6030 : // However, do not do that rounding down to a multiple of 4096 if that is a
6031 : // negative number, as it appears to be illegal to have a negative offset
6032 : // in the vgpr, even if adding the immediate offset makes it positive.
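 :     // E.g. for 0xFFFFF00F the rounded-down part 0xFFFFF000 is negative as an
 :     // i32, so the whole value goes into voffset and immoffset stays 0.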
6033 487 : unsigned Overflow = ImmOffset & ~MaxImm;
6034 487 : ImmOffset -= Overflow;
6035 487 : if ((int32_t)Overflow < 0) {
6036 : Overflow += ImmOffset;
6037 : ImmOffset = 0;
6038 : }
6039 487 : C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
6040 487 : if (Overflow) {
6041 4 : auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
6042 4 : if (!N0)
6043 0 : N0 = OverflowVal;
6044 : else {
6045 4 : SDValue Ops[] = { N0, OverflowVal };
6046 4 : N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
6047 : }
6048 : }
6049 : }
6050 610 : if (!N0)
6051 407 : N0 = DAG.getConstant(0, DL, MVT::i32);
6052 610 : if (!C1)
6053 123 : C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
6054 610 : return {N0, SDValue(C1, 0)};
6055 : }
6056 :
6057 : // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
6058 : // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
6059 : // pointed to by Offsets.
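 : // Roughly: a purely constant combined offset that the subtarget can encode
 : // is split between soffset and instoffset with voffset = 0; a base plus an
 : // encodable constant keeps the base in voffset and splits the constant;
 : // anything else goes entirely into voffset with soffset = instoffset = 0.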
6060 489 : void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
6061 : SelectionDAG &DAG, SDValue *Offsets,
6062 : unsigned Align) const {
6063 : SDLoc DL(CombinedOffset);
6064 : if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
6065 296 : uint32_t Imm = C->getZExtValue();
6066 : uint32_t SOffset, ImmOffset;
6067 296 : if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
6068 288 : Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
6069 288 : Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6070 288 : Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6071 288 : return;
6072 : }
6073 : }
6074 201 : if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
6075 106 : SDValue N0 = CombinedOffset.getOperand(0);
6076 106 : SDValue N1 = CombinedOffset.getOperand(1);
6077 : uint32_t SOffset, ImmOffset;
6078 106 : int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
6079 210 : if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6080 104 : Subtarget, Align)) {
6081 98 : Offsets[0] = N0;
6082 98 : Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6083 98 : Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6084 98 : return;
6085 : }
6086 : }
6087 103 : Offsets[0] = CombinedOffset;
6088 103 : Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
6089 103 : Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
6090 : }
6091 :
6092 48 : static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
6093 : ISD::LoadExtType ExtType, SDValue Op,
6094 : const SDLoc &SL, EVT VT) {
6095 48 : if (VT.bitsLT(Op.getValueType()))
6096 16 : return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
6097 :
6098 32 : switch (ExtType) {
6099 : case ISD::SEXTLOAD:
6100 2 : return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
6101 : case ISD::ZEXTLOAD:
6102 20 : return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
6103 : case ISD::EXTLOAD:
6104 10 : return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
6105 0 : case ISD::NON_EXTLOAD:
6106 0 : return Op;
6107 : }
6108 :
6109 0 : llvm_unreachable("invalid ext type");
6110 : }
6111 :
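 : // A rough summary of widenLoad: uniform constant-address loads narrower than
 : // 32 bits (once DAG legalization is done) are widened to an i32 load, the
 : // loaded value is masked/extended back to the original width, and the result
 : // is bitcast to the original type, presumably so the access can still be
 : // selected as a scalar load.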
6112 310576 : SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
6113 310576 : SelectionDAG &DAG = DCI.DAG;
6114 310576 : if (Ld->getAlignment() < 4 || Ld->isDivergent())
6115 73345 : return SDValue();
6116 :
6117 : // FIXME: Constant loads should all be marked invariant.
6118 : unsigned AS = Ld->getAddressSpace();
6119 474462 : if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
6120 237231 : AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6121 35839 : (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
6122 85531 : return SDValue();
6123 :
6124 : // Don't do this early, since it may interfere with adjacent load merging for
6125 : // illegal types. We can avoid losing alignment information for exotic types
6126 : // pre-legalize.
6127 151700 : EVT MemVT = Ld->getMemoryVT();
6128 151700 : if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
6129 41428 : MemVT.getSizeInBits() >= 32)
6130 151652 : return SDValue();
6131 :
6132 : SDLoc SL(Ld);
6133 :
6134 : assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
6135 : "unexpected vector extload");
6136 :
6137 : // TODO: Drop only high part of range.
6138 48 : SDValue Ptr = Ld->getBasePtr();
6139 : SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
6140 : MVT::i32, SL, Ld->getChain(), Ptr,
6141 : Ld->getOffset(),
6142 48 : Ld->getPointerInfo(), MVT::i32,
6143 : Ld->getAlignment(),
6144 48 : Ld->getMemOperand()->getFlags(),
6145 48 : Ld->getAAInfo(),
6146 48 : nullptr); // Drop ranges
6147 :
6148 48 : EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
6149 48 : if (MemVT.isFloatingPoint()) {
6150 : assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
6151 : "unexpected fp extload");
6152 0 : TruncVT = MemVT.changeTypeToInteger();
6153 : }
6154 :
6155 48 : SDValue Cvt = NewLoad;
6156 48 : if (Ld->getExtensionType() == ISD::SEXTLOAD) {
6157 2 : Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
6158 2 : DAG.getValueType(TruncVT));
6159 46 : } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
6160 : Ld->getExtensionType() == ISD::NON_EXTLOAD) {
6161 35 : Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
6162 : } else {
6163 : assert(Ld->getExtensionType() == ISD::EXTLOAD);
6164 : }
6165 :
6166 48 : EVT VT = Ld->getValueType(0);
6167 48 : EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
6168 :
6169 48 : DCI.AddToWorklist(Cvt.getNode());
6170 :
6171 : // We may need to handle exotic cases, such as i16->i64 extloads, so insert
6172 : // the appropriate extension from the 32-bit load.
6173 48 : Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
6174 48 : DCI.AddToWorklist(Cvt.getNode());
6175 :
6176 : // Handle conversion back to floating point if necessary.
6177 48 : Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
6178 :
6179 96 : return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
6180 : }
6181 :
6182 74303 : SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
6183 : SDLoc DL(Op);
6184 : LoadSDNode *Load = cast<LoadSDNode>(Op);
6185 : ISD::LoadExtType ExtType = Load->getExtensionType();
6186 74303 : EVT MemVT = Load->getMemoryVT();
6187 :
6188 74303 : if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
6189 : if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
6190 2253 : return SDValue();
6191 :
6192 : // FIXME: Copied from PPC
6193 : // First, load into 32 bits, then truncate to 1 bit.
6194 :
6195 120 : SDValue Chain = Load->getChain();
6196 120 : SDValue BasePtr = Load->getBasePtr();
6197 120 : MachineMemOperand *MMO = Load->getMemOperand();
6198 :
6199 : EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
6200 :
6201 : SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
6202 120 : BasePtr, RealMemVT, MMO);
6203 :
6204 : SDValue Ops[] = {
6205 120 : DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
6206 120 : NewLD.getValue(1)
6207 : };
6208 :
6209 120 : return DAG.getMergeValues(Ops, DL);
6210 : }
6211 :
6212 71930 : if (!MemVT.isVector())
6213 0 : return SDValue();
6214 :
6215 : assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
6216 : "Custom lowering for non-i32 vectors hasn't been implemented.");
6217 :
6218 71930 : unsigned Alignment = Load->getAlignment();
6219 : unsigned AS = Load->getAddressSpace();
6220 71930 : if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
6221 : AS, Alignment)) {
6222 0 : SDValue Ops[2];
6223 0 : std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
6224 0 : return DAG.getMergeValues(Ops, DL);
6225 : }
6226 :
6227 71930 : MachineFunction &MF = DAG.getMachineFunction();
6228 71930 : SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6229             :   // If there is a possibility that a flat instruction accesses scratch memory,
6230             :   // then we need to use the same legalization rules we use for private.
6231 71930 : if (AS == AMDGPUAS::FLAT_ADDRESS)
6232 27 : AS = MFI->hasFlatScratchInit() ?
6233 : AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
6234 :
6235 : unsigned NumElements = MemVT.getVectorNumElements();
6236 :
6237 143860 : if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6238 71930 : AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
6239 47010 : if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32)
6240 46756 : return SDValue();
6241 : // Non-uniform loads will be selected to MUBUF instructions, so they
6242 : // have the same legalization requirements as global and private
6243 : // loads.
6244 : //
6245 : }
6246 :
6247 25174 : if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6248 24920 : AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6249 : AS == AMDGPUAS::GLOBAL_ADDRESS) {
6250 7618 : if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
6251 2443 : !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
6252 14761 : Alignment >= 4 && NumElements < 32)
6253 841 : return SDValue();
6254 : // Non-uniform loads will be selected to MUBUF instructions, so they
6255 : // have the same legalization requirements as global and private
6256 : // loads.
6257 : //
6258 : }
6259 24333 : if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6260 : AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6261 24079 : AS == AMDGPUAS::GLOBAL_ADDRESS ||
6262 : AS == AMDGPUAS::FLAT_ADDRESS) {
6263 13059 : if (NumElements > 4)
6264 1263 : return SplitVectorLoad(Op, DAG);
6265 : // v4 loads are supported for private and global memory.
6266 11796 : return SDValue();
6267 : }
6268 11274 : if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
6269 : // Depending on the setting of the private_element_size field in the
6270 : // resource descriptor, we can only make private accesses up to a certain
6271 : // size.
6272 379 : switch (Subtarget->getMaxPrivateElementSize()) {
6273 216 : case 4:
6274 216 : return scalarizeVectorLoad(Load, DAG);
6275 53 : case 8:
6276 53 : if (NumElements > 2)
6277 5 : return SplitVectorLoad(Op, DAG);
6278 48 : return SDValue();
6279 110 : case 16:
6280 : // Same as global/flat
6281 110 : if (NumElements > 4)
6282 1 : return SplitVectorLoad(Op, DAG);
6283 109 : return SDValue();
6284 0 : default:
6285 0 : llvm_unreachable("unsupported private_element_size");
6286 : }
6287 10895 : } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
6288 : // Use ds_read_b128 if possible.
6289 12303 : if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
6290 : MemVT.getStoreSize() == 16)
6291 1102 : return SDValue();
6292 :
6293 9793 : if (NumElements > 2)
6294 1224 : return SplitVectorLoad(Op, DAG);
6295 :
6296             :     // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6297 : // address is negative, then the instruction is incorrectly treated as
6298 : // out-of-bounds even if base + offsets is in bounds. Split vectorized
6299 : // loads here to avoid emitting ds_read2_b32. We may re-combine the
6300 : // load later in the SILoadStoreOptimizer.
6301 3336 : if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6302 15241 : NumElements == 2 && MemVT.getStoreSize() == 8 &&
6303 3336 : Load->getAlignment() < 8) {
6304 19 : return SplitVectorLoad(Op, DAG);
6305 : }
6306 : }
6307 8550 : return SDValue();
6308 : }
6309 :
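 : // Sketch of LowerSELECT: a 64-bit select is lowered by bitcasting both
 : // operands to v2i32, selecting the low and high 32-bit halves separately
 : // with the same condition, and bitcasting the rebuilt vector back.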
6310 734 : SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
6311 734 : EVT VT = Op.getValueType();
6312 : assert(VT.getSizeInBits() == 64);
6313 :
6314 : SDLoc DL(Op);
6315 734 : SDValue Cond = Op.getOperand(0);
6316 :
6317 734 : SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
6318 734 : SDValue One = DAG.getConstant(1, DL, MVT::i32);
6319 :
6320 734 : SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
6321 734 : SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
6322 :
6323 734 : SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
6324 734 : SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
6325 :
6326 734 : SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
6327 :
6328 734 : SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
6329 734 : SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
6330 :
6331 734 : SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
6332 :
6333 1468 : SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
6334 734 : return DAG.getNode(ISD::BITCAST, DL, VT, Res);
6335 : }
6336 :
6337 : // Catch division cases where we can use shortcuts with rcp and rsq
6338 : // instructions.
6339 190 : SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
6340 : SelectionDAG &DAG) const {
6341 : SDLoc SL(Op);
6342 190 : SDValue LHS = Op.getOperand(0);
6343 190 : SDValue RHS = Op.getOperand(1);
6344 190 : EVT VT = Op.getValueType();
6345 190 : const SDNodeFlags Flags = Op->getFlags();
6346 190 : bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
6347 :
6348 153 : if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
6349 18 : return SDValue();
6350 :
6351 : if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
6352 112 : if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
6353 112 : if (CLHS->isExactlyValue(1.0)) {
6354 : // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
6355 : // the CI documentation has a worst case error of 1 ulp.
6356 : // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
6357 : // use it as long as we aren't trying to use denormals.
6358 : //
6359 : // v_rcp_f16 and v_rsq_f16 DO support denormals.
6360 :
6361 : // 1.0 / sqrt(x) -> rsq(x)
6362 :
6363 : // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
6364 : // error seems really high at 2^29 ULP.
6365 75 : if (RHS.getOpcode() == ISD::FSQRT)
6366 7 : return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
6367 :
6368 : // 1.0 / x -> rcp(x)
6369 68 : return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6370 : }
6371 :
6372 : // Same as for 1.0, but expand the sign out of the constant.
6373 37 : if (CLHS->isExactlyValue(-1.0)) {
6374 : // -1.0 / x -> rcp (fneg x)
6375 34 : SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
6376 34 : return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
6377 : }
6378 : }
6379 : }
6380 :
6381 63 : if (Unsafe) {
6382 : // Turn into multiply by the reciprocal.
6383 : // x / y -> x * (1.0 / y)
6384 12 : SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6385 12 : return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
6386 : }
6387 :
6388 51 : return SDValue();
6389 : }
6390 :
6391 0 : static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6392 : EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
6393 0 : if (GlueChain->getNumValues() <= 1) {
6394 0 : return DAG.getNode(Opcode, SL, VT, A, B);
6395 : }
6396 :
6397 : assert(GlueChain->getNumValues() == 3);
6398 :
6399 0 : SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6400 0 : switch (Opcode) {
6401 0 : default: llvm_unreachable("no chain equivalent for opcode");
6402 0 : case ISD::FMUL:
6403 : Opcode = AMDGPUISD::FMUL_W_CHAIN;
6404 : break;
6405 : }
6406 :
6407 : return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
6408 0 : GlueChain.getValue(2));
6409 : }
6410 :
6411 0 : static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6412 : EVT VT, SDValue A, SDValue B, SDValue C,
6413 : SDValue GlueChain) {
6414 0 : if (GlueChain->getNumValues() <= 1) {
6415 0 : return DAG.getNode(Opcode, SL, VT, A, B, C);
6416 : }
6417 :
6418 : assert(GlueChain->getNumValues() == 3);
6419 :
6420 0 : SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6421 0 : switch (Opcode) {
6422 0 : default: llvm_unreachable("no chain equivalent for opcode");
6423 0 : case ISD::FMA:
6424 : Opcode = AMDGPUISD::FMA_W_CHAIN;
6425 : break;
6426 : }
6427 :
6428 : return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
6429 0 : GlueChain.getValue(2));
6430 : }
6431 :
6432 27 : SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
6433 27 : if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6434 24 : return FastLowered;
6435 :
6436 : SDLoc SL(Op);
6437 3 : SDValue Src0 = Op.getOperand(0);
6438 3 : SDValue Src1 = Op.getOperand(1);
6439 :
6440 3 : SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6441 3 : SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6442 :
6443 3 : SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
6444 3 : SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
6445 :
6446 3 : SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
6447 3 : SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
6448 :
6449 3 : return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
6450 : }
6451 :
6452 : // Faster 2.5 ULP division that does not support denormals.
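 : // Rough idea of the scaling below: if |RHS| is huge (> 0x6f800000, about
 : // 2^96) its reciprocal could flush to zero, so RHS is pre-scaled by
 : // 0x2f800000 (2^-32) before the rcp and the quotient is multiplied by the
 : // same scale factor afterwards to compensate.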
6453 33 : SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
6454 : SDLoc SL(Op);
6455 33 : SDValue LHS = Op.getOperand(1);
6456 33 : SDValue RHS = Op.getOperand(2);
6457 :
6458 33 : SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
6459 :
6460 33 : const APFloat K0Val(BitsToFloat(0x6f800000));
6461 33 : const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
6462 :
6463 33 : const APFloat K1Val(BitsToFloat(0x2f800000));
6464 33 : const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
6465 :
6466 33 : const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
6467 :
6468 : EVT SetCCVT =
6469 33 : getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
6470 :
6471 33 : SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
6472 :
6473 33 : SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
6474 :
6475 : // TODO: Should this propagate fast-math-flags?
6476 33 : r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
6477 :
6478 : // rcp does not support denormals.
6479 33 : SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
6480 :
6481 33 : SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
6482 :
6483 33 : return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
6484 : }
6485 :
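 : // Rough shape of LowerFDIV32: both operands are conditioned with DIV_SCALE,
 : // an initial RCP of the denominator is refined with a short FMA-based
 : // Newton-Raphson sequence, and DIV_FMAS/DIV_FIXUP apply the final scale and
 : // special-case handling; when FP32 denormals are off, denormal mode is
 : // temporarily enabled around the FMA sequence via SETREG.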
6486 156 : SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
6487 156 : if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6488 90 : return FastLowered;
6489 :
6490 : SDLoc SL(Op);
6491 66 : SDValue LHS = Op.getOperand(0);
6492 66 : SDValue RHS = Op.getOperand(1);
6493 :
6494 66 : const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
6495 :
6496 66 : SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
6497 :
6498 : SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
6499 66 : RHS, RHS, LHS);
6500 : SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
6501 66 : LHS, RHS, LHS);
6502 :
6503 : // Denominator is scaled to not be denormal, so using rcp is ok.
6504 : SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
6505 66 : DenominatorScaled);
6506 : SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
6507 66 : DenominatorScaled);
6508 :
6509 : const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
6510 : (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
6511 : (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
6512 :
6513 66 : const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
6514 :
6515 66 : if (!Subtarget->hasFP32Denormals()) {
6516 48 : SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
6517 : const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
6518 48 : SL, MVT::i32);
6519 : SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
6520 : DAG.getEntryNode(),
6521 48 : EnableDenormValue, BitField);
6522 : SDValue Ops[3] = {
6523 : NegDivScale0,
6524 48 : EnableDenorm.getValue(0),
6525 48 : EnableDenorm.getValue(1)
6526 48 : };
6527 :
6528 48 : NegDivScale0 = DAG.getMergeValues(Ops, SL);
6529 : }
6530 :
6531 : SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
6532 66 : ApproxRcp, One, NegDivScale0);
6533 :
6534 : SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
6535 66 : ApproxRcp, Fma0);
6536 :
6537 : SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
6538 66 : Fma1, Fma1);
6539 :
6540 : SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
6541 66 : NumeratorScaled, Mul);
6542 :
6543 66 : SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
6544 :
6545 : SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
6546 66 : NumeratorScaled, Fma3);
6547 :
6548 66 : if (!Subtarget->hasFP32Denormals()) {
6549 : const SDValue DisableDenormValue =
6550 48 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
6551 : SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
6552 : Fma4.getValue(1),
6553 : DisableDenormValue,
6554 : BitField,
6555 48 : Fma4.getValue(2));
6556 :
6557 : SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
6558 48 : DisableDenorm, DAG.getRoot());
6559 48 : DAG.setRoot(OutputChain);
6560 : }
6561 :
6562 66 : SDValue Scale = NumeratorScaled.getValue(1);
6563 : SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
6564 66 : Fma4, Fma1, Fma3, Scale);
6565 :
6566 66 : return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
6567 : }
6568 :
6569 68 : SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
6570 68 : if (DAG.getTarget().Options.UnsafeFPMath)
6571 7 : return lowerFastUnsafeFDIV(Op, DAG);
6572 :
6573 : SDLoc SL(Op);
6574 61 : SDValue X = Op.getOperand(0);
6575 61 : SDValue Y = Op.getOperand(1);
6576 :
6577 61 : const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
6578 :
6579 61 : SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
6580 :
6581 61 : SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
6582 :
6583 61 : SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
6584 :
6585 61 : SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
6586 :
6587 61 : SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
6588 :
6589 61 : SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
6590 :
6591 61 : SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
6592 :
6593 61 : SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
6594 :
6595 61 : SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
6596 61 : SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
6597 :
6598 : SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
6599 61 : NegDivScale0, Mul, DivScale1);
6600 :
6601 61 : SDValue Scale;
6602 :
6603 61 : if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
6604 : // Workaround a hardware bug on SI where the condition output from div_scale
6605 : // is not usable.
6606 :
6607 23 : const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
6608 :
6609             :     // Figure out which scale to use for div_fmas.
6610 23 : SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
6611 23 : SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
6612 23 : SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
6613 23 : SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
6614 :
6615 23 : SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
6616 23 : SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
6617 :
6618 : SDValue Scale0Hi
6619 23 : = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
6620 : SDValue Scale1Hi
6621 23 : = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
6622 :
6623 23 : SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
6624 23 : SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
6625 23 : Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
6626 : } else {
6627 38 : Scale = DivScale1.getValue(1);
6628 : }
6629 :
6630 : SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
6631 61 : Fma4, Fma3, Mul, Scale);
6632 :
6633 61 : return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
6634 : }
6635 :
6636 251 : SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
6637 251 : EVT VT = Op.getValueType();
6638 :
6639 : if (VT == MVT::f32)
6640 156 : return LowerFDIV32(Op, DAG);
6641 :
6642 : if (VT == MVT::f64)
6643 68 : return LowerFDIV64(Op, DAG);
6644 :
6645 : if (VT == MVT::f16)
6646 27 : return LowerFDIV16(Op, DAG);
6647 :
6648 0 : llvm_unreachable("Unexpected type for fdiv");
6649 : }
6650 :
6651 82361 : SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
6652 : SDLoc DL(Op);
6653 : StoreSDNode *Store = cast<StoreSDNode>(Op);
6654 82361 : EVT VT = Store->getMemoryVT();
6655 :
6656 : if (VT == MVT::i1) {
6657 : return DAG.getTruncStore(Store->getChain(), DL,
6658 : DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
6659 380 : Store->getBasePtr(), MVT::i1, Store->getMemOperand());
6660 : }
6661 :
6662 : assert(VT.isVector() &&
6663 : Store->getValue().getValueType().getScalarType() == MVT::i32);
6664 :
6665 : unsigned AS = Store->getAddressSpace();
6666 82171 : if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
6667 : AS, Store->getAlignment())) {
6668 24 : return expandUnalignedStore(Store, DAG);
6669 : }
6670 :
6671 82147 : MachineFunction &MF = DAG.getMachineFunction();
6672 82147 : SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6673             :   // If there is a possibility that a flat instruction accesses scratch memory,
6674             :   // then we need to use the same legalization rules we use for private.
6675 82147 : if (AS == AMDGPUAS::FLAT_ADDRESS)
6676 261 : AS = MFI->hasFlatScratchInit() ?
6677 : AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
6678 :
6679 : unsigned NumElements = VT.getVectorNumElements();
6680 82147 : if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
6681 : AS == AMDGPUAS::FLAT_ADDRESS) {
6682 40502 : if (NumElements > 4)
6683 4385 : return SplitVectorStore(Op, DAG);
6684 36117 : return SDValue();
6685 41645 : } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
6686 564 : switch (Subtarget->getMaxPrivateElementSize()) {
6687 344 : case 4:
6688 344 : return scalarizeVectorStore(Store, DAG);
6689 86 : case 8:
6690 86 : if (NumElements > 2)
6691 10 : return SplitVectorStore(Op, DAG);
6692 76 : return SDValue();
6693 134 : case 16:
6694 134 : if (NumElements > 4)
6695 2 : return SplitVectorStore(Op, DAG);
6696 132 : return SDValue();
6697 0 : default:
6698 0 : llvm_unreachable("unsupported private_element_size");
6699 : }
6700 41081 : } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
6701 : // Use ds_write_b128 if possible.
6702 46309 : if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
6703 : VT.getStoreSize() == 16)
6704 4490 : return SDValue();
6705 :
6706 36591 : if (NumElements > 2)
6707 4042 : return SplitVectorStore(Op, DAG);
6708 :
6709             :     // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6710 : // address is negative, then the instruction is incorrectly treated as
6711 : // out-of-bounds even if base + offsets is in bounds. Split vectorized
6712 : // stores here to avoid emitting ds_write2_b32. We may re-combine the
6713 : // store later in the SILoadStoreOptimizer.
6714 14225 : if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6715 60999 : NumElements == 2 && VT.getStoreSize() == 8 &&
6716 14225 : Store->getAlignment() < 8) {
6717 33 : return SplitVectorStore(Op, DAG);
6718 : }
6719 :
6720 32516 : return SDValue();
6721 : } else {
6722 0 : llvm_unreachable("unhandled address space");
6723 : }
6724 : }
6725 :
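 : // Sketch of LowerTrig: the hardware SIN/COS take an argument pre-multiplied
 : // by 1/(2*pi); subtargets with a reduced valid input range additionally need
 : // the scaled value wrapped into [0, 1) with FRACT before the hardware op.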
6726 98 : SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
6727 : SDLoc DL(Op);
6728 98 : EVT VT = Op.getValueType();
6729 98 : SDValue Arg = Op.getOperand(0);
6730 98 : SDValue TrigVal;
6731 :
6732 : // TODO: Should this propagate fast-math-flags?
6733 :
6734 98 : SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
6735 :
6736 98 : if (Subtarget->hasTrigReducedRange()) {
6737 68 : SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
6738 68 : TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
6739 : } else {
6740 30 : TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
6741 : }
6742 :
6743 98 : switch (Op.getOpcode()) {
6744 : case ISD::FCOS:
6745 60 : return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
6746 : case ISD::FSIN:
6747 136 : return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
6748 0 : default:
6749 0 : llvm_unreachable("Wrong trig opcode");
6750 : }
6751 : }
6752 :
6753 263 : SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
6754 : AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
6755 : assert(AtomicNode->isCompareAndSwap());
6756 : unsigned AS = AtomicNode->getAddressSpace();
6757 :
6758 : // No custom lowering required for local address space
6759 263 : if (!isFlatGlobalAddrSpace(AS))
6760 66 : return Op;
6761 :
6762 : // Non-local address space requires custom lowering for atomic compare
6763 : // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
6764 : SDLoc DL(Op);
6765 197 : SDValue ChainIn = Op.getOperand(0);
6766 197 : SDValue Addr = Op.getOperand(1);
6767 197 : SDValue Old = Op.getOperand(2);
6768 197 : SDValue New = Op.getOperand(3);
6769 197 : EVT VT = Op.getValueType();
6770 197 : MVT SimpleVT = VT.getSimpleVT();
6771 197 : MVT VecType = MVT::getVectorVT(SimpleVT, 2);
6772 :
6773 394 : SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
6774 197 : SDValue Ops[] = { ChainIn, Addr, NewOld };
6775 :
6776 : return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
6777 591 : Ops, VT, AtomicNode->getMemOperand());
6778 : }
6779 :
6780 : //===----------------------------------------------------------------------===//
6781 : // Custom DAG optimizations
6782 : //===----------------------------------------------------------------------===//
6783 :
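 : // Roughly: when the source of this integer-to-f32 conversion is known to
 : // have its top 24 bits clear, the conversion is replaced with
 : // CVT_F32_UBYTE0 of that source.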
6784 1699 : SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
6785 : DAGCombinerInfo &DCI) const {
6786 1699 : EVT VT = N->getValueType(0);
6787 1699 : EVT ScalarVT = VT.getScalarType();
6788 1699 : if (ScalarVT != MVT::f32)
6789 207 : return SDValue();
6790 :
6791 1492 : SelectionDAG &DAG = DCI.DAG;
6792 : SDLoc DL(N);
6793 :
6794 1492 : SDValue Src = N->getOperand(0);
6795 : EVT SrcVT = Src.getValueType();
6796 :
6797 : // TODO: We could try to match extracting the higher bytes, which would be
6798 : // easier if i8 vectors weren't promoted to i32 vectors, particularly after
6799 : // types are legalized. v4i8 -> v4f32 is probably the only case to worry
6800 : // about in practice.
6801 1492 : if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
6802 1162 : if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
6803 107 : SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
6804 107 : DCI.AddToWorklist(Cvt.getNode());
6805 107 : return Cvt;
6806 : }
6807 : }
6808 :
6809 1385 : return SDValue();
6810 : }
6811 :
6812 : // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
6813 :
6814 : // This is a variant of
6815 : // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
6816 : //
6817 : // The normal DAG combiner will do this, but only if the add has one use since
6818 : // that would increase the number of instructions.
6819 : //
6820 : // This prevents us from seeing a constant offset that can be folded into a
6821 : // memory instruction's addressing mode. If we know the resulting add offset of
6822 : // a pointer can be folded into an addressing offset, we can replace the pointer
6823 : // operand with the add of new constant offset. This eliminates one of the uses,
6824 : // and may allow the remaining use to also be simplified.
6825 : //
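 : // For example, with a 4-byte access: (shl (add x, 16), 2) is rewritten to
 : // (add (shl x, 2), 64), letting the 64 fold into the addressing-mode offset
 : // while the other users of the add keep the original node.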
6826 213 : SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
6827 : unsigned AddrSpace,
6828 : EVT MemVT,
6829 : DAGCombinerInfo &DCI) const {
6830 213 : SDValue N0 = N->getOperand(0);
6831 213 : SDValue N1 = N->getOperand(1);
6832 :
6833 : // We only do this to handle cases where it's profitable when there are
6834 : // multiple uses of the add, so defer to the standard combine.
6835 213 : if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
6836 : N0->hasOneUse())
6837 167 : return SDValue();
6838 :
6839 : const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
6840 : if (!CN1)
6841 0 : return SDValue();
6842 :
6843 : const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6844 : if (!CAdd)
6845 2 : return SDValue();
6846 :
6847 : // If the resulting offset is too large, we can't fold it into the addressing
6848 : // mode offset.
6849 88 : APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
6850 44 : Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
6851 :
6852 44 : AddrMode AM;
6853 44 : AM.HasBaseReg = true;
6854 44 : AM.BaseOffs = Offset.getSExtValue();
6855 44 : if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
6856 14 : return SDValue();
6857 :
6858 30 : SelectionDAG &DAG = DCI.DAG;
6859 : SDLoc SL(N);
6860 60 : EVT VT = N->getValueType(0);
6861 :
6862 30 : SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
6863 30 : SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
6864 :
6865 : SDNodeFlags Flags;
6866 30 : Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
6867 0 : (N0.getOpcode() == ISD::OR ||
6868 0 : N0->getFlags().hasNoUnsignedWrap()));
6869 :
6870 30 : return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
6871 : }
6872 :
6873 341305 : SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
6874 : DAGCombinerInfo &DCI) const {
6875 341305 : SDValue Ptr = N->getBasePtr();
6876 341305 : SelectionDAG &DAG = DCI.DAG;
6877 : SDLoc SL(N);
6878 :
6879 : // TODO: We could also do this for multiplies.
6880 341305 : if (Ptr.getOpcode() == ISD::SHL) {
6881 : SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
6882 213 : N->getMemoryVT(), DCI);
6883 213 : if (NewPtr) {
6884 30 : SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
6885 :
6886 42 : NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
6887 30 : return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
6888 : }
6889 : }
6890 :
6891 341275 : return SDValue();
6892 : }
6893 :
6894 : static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
6895 2957 : return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
6896 3555 : (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
6897 2021 : (Opc == ISD::XOR && Val == 0);
6898 : }
6899 :
6900 : // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
6901 : // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
6902 : // integer combine opportunities since most 64-bit operations are decomposed
6903 : // this way. TODO: We won't want this for SALU especially if it is an inline
6904 : // immediate.
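 : // For example, (and x:i64, 0x00000000ffffffff) splits into an AND of the low
 : // half with 0xffffffff (a no-op) and an AND of the high half with 0 (zero),
 : // both of which fold away in later 32-bit combines.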
6905 2001 : SDValue SITargetLowering::splitBinaryBitConstantOp(
6906 : DAGCombinerInfo &DCI,
6907 : const SDLoc &SL,
6908 : unsigned Opc, SDValue LHS,
6909 : const ConstantSDNode *CRHS) const {
6910 2001 : uint64_t Val = CRHS->getZExtValue();
6911 : uint32_t ValLo = Lo_32(Val);
6912 : uint32_t ValHi = Hi_32(Val);
6913 2001 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6914 :
6915 : if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
6916 : bitOpWithConstantIsReducible(Opc, ValHi)) ||
6917 378 : (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
6918 : // If we need to materialize a 64-bit immediate, it will be split up later
6919 : // anyway. Avoid creating the harder to understand 64-bit immediate
6920 : // materialization.
6921 1630 : return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
6922 : }
6923 :
6924 371 : return SDValue();
6925 : }
6926 :
6927 : // Returns true if argument is a boolean value which is not serialized into
6928             : // Returns true if the argument is a boolean value which is not serialized into
6929             : // memory or an argument and does not require v_cndmask_b32 to be deserialized.
6930 0 : if (V.getValueType() != MVT::i1)
6931 0 : return false;
6932 240 : switch (V.getOpcode()) {
6933 : default: break;
6934 : case ISD::SETCC:
6935 : case ISD::AND:
6936 : case ISD::OR:
6937 : case ISD::XOR:
6938 : case AMDGPUISD::FP_CLASS:
6939 : return true;
6940 : }
6941 0 : return false;
6942 : }
6943 :
6944 : // If a constant has all zeroes or all ones within each byte return it.
6945 : // Otherwise return 0.
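 : // E.g. 0x00ff00ff is returned unchanged, while 0x00f000ff returns 0 because
 : // one byte is only partially selected.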
6946 318 : static uint32_t getConstantPermuteMask(uint32_t C) {
6947 : // 0xff for any zero byte in the mask
6948 : uint32_t ZeroByteMask = 0;
6949 318 : if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
6950 318 : if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
6951 318 : if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
6952 318 : if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
6953 318 : uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
6954 318 : if ((NonZeroByteMask & C) != NonZeroByteMask)
6955 12 : return 0; // Partial bytes selected.
6956 : return C;
6957 : }
6958 :
6959 : // Check if a node selects whole bytes from its operand 0 starting at a byte
6960 : // boundary while masking the rest. Returns select mask as in the v_perm_b32
6961             : // boundary while masking the rest. Returns the select mask as used by
6962             : // v_perm_b32, or ~0 if it did not succeed.
6963 : // value 0-3 selects corresponding source byte;
6964 : // value 0xc selects zero;
6965 : // value 0xff selects 0xff.
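 : // For example, (and x, 0x0000ffff) yields the mask 0x0c0c0100 (zero the two
 : // high bytes, pass the two low bytes through), and (srl x, 16) yields
 : // 0x0c0c0302 (source bytes 2 and 3 in the low positions, zeros above).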
6966 0 : static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
6967 : assert(V.getValueSizeInBits() == 32);
6968 :
6969 0 : if (V.getNumOperands() != 2)
6970 0 : return ~0;
6971 :
6972 : ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
6973 : if (!N1)
6974 0 : return ~0;
6975 :
6976 0 : uint32_t C = N1->getZExtValue();
6977 :
6978 0 : switch (V.getOpcode()) {
6979 : default:
6980 : break;
6981 0 : case ISD::AND:
6982 0 : if (uint32_t ConstMask = getConstantPermuteMask(C)) {
6983 0 : return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
6984 0 : }
6985 : break;
6986 :
6987 0 : case ISD::OR:
6988 0 : if (uint32_t ConstMask = getConstantPermuteMask(C)) {
6989 0 : return (0x03020100 & ~ConstMask) | ConstMask;
6990 0 : }
6991 : break;
6992 :
6993 0 : case ISD::SHL:
6994 0 : if (C % 8)
6995 0 : return ~0;
6996 :
6997 0 : return uint32_t((0x030201000c0c0c0cull << C) >> 32);
6998 :
6999 0 : case ISD::SRL:
7000 0 : if (C % 8)
7001 0 : return ~0;
7002 :
7003 0 : return uint32_t(0x0c0c0c0c03020100ull >> C);
7004 : }
7005 :
7006 : return ~0;
7007 : }
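// A standalone sketch of the shift cases above (illustrative only; this helper
// is hypothetical and not used by the lowering). It reproduces the mask
// constants so concrete values can be checked by hand, e.g.
//   srl x, 8  -> 0x0c030201  (result byte 3 selects zero)
//   shl x, 8  -> 0x0201000c  (result byte 0 selects zero)
LLVM_ATTRIBUTE_UNUSED
static uint32_t modelShiftPermuteMask(bool IsShl, uint32_t ByteAlignedShift) {
  assert(ByteAlignedShift % 8 == 0 && ByteAlignedShift < 32);
  // 0x03020100 is the identity byte select; 0x0c lanes select zero.
  return IsShl ? uint32_t((0x030201000c0c0c0cull << ByteAlignedShift) >> 32)
               : uint32_t(0x0c0c0c0c03020100ull >> ByteAlignedShift);
}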
7008 :
7009 36547 : SDValue SITargetLowering::performAndCombine(SDNode *N,
7010 : DAGCombinerInfo &DCI) const {
7011 36547 : if (DCI.isBeforeLegalize())
7012 1407 : return SDValue();
7013 :
7014 35140 : SelectionDAG &DAG = DCI.DAG;
7015 35140 : EVT VT = N->getValueType(0);
7016 35140 : SDValue LHS = N->getOperand(0);
7017 35140 : SDValue RHS = N->getOperand(1);
7018 :
7019 :
7020 : const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7021 1823 : if (VT == MVT::i64 && CRHS) {
7022 1611 : if (SDValue Split
7023 1611 : = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
7024 1527 : return Split;
7025 : }
7026 :
7027 33613 : if (CRHS && VT == MVT::i32) {
7028 : // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
7029 : // nb = number of trailing zeroes in mask
7030 : // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
7031 :     // given that we are selecting 8 or 16 bit fields starting at a byte boundary.
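    // A concrete instance (illustrative): (and (srl x, 8), 0xff00) has
    // Bits = 8 and nb = 8, so it becomes (shl (bfe_u32 x, 16, 8), 8), i.e.
    // bits [23:16] of x extracted and repositioned at [15:8].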
7032 29468 : uint64_t Mask = CRHS->getZExtValue();
7033 : unsigned Bits = countPopulation(Mask);
7034 41674 : if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
7035 34912 : (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
7036 54 : if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
7037 54 : unsigned Shift = CShift->getZExtValue();
7038 54 : unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
7039 54 : unsigned Offset = NB + Shift;
7040 54 : if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
7041 : SDLoc SL(N);
7042 : SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
7043 54 : LHS->getOperand(0),
7044 : DAG.getConstant(Offset, SL, MVT::i32),
7045 54 : DAG.getConstant(Bits, SL, MVT::i32));
7046 54 : EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
7047 : SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
7048 54 : DAG.getValueType(NarrowVT));
7049 54 : SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
7050 108 : DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
7051 54 : return Shl;
7052 : }
7053 : }
7054 : }
7055 :
7056 : // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7057 29414 : if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
7058 : isa<ConstantSDNode>(LHS.getOperand(2))) {
7059 2 : uint32_t Sel = getConstantPermuteMask(Mask);
7060 2 : if (!Sel)
7061 0 : return SDValue();
7062 :
7063 : // Select 0xc for all zero bytes
7064 2 : Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
7065 : SDLoc DL(N);
7066 : return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7067 2 : LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7068 : }
7069 : }
7070 :
7071 : // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
7072 : // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
7073 33557 : if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
7074 325 : ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7075 325 : ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
7076 :
7077 325 : SDValue X = LHS.getOperand(0);
7078 325 : SDValue Y = RHS.getOperand(0);
7079 325 : if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
7080 321 : return SDValue();
7081 :
7082 4 : if (LCC == ISD::SETO) {
7083 : if (X != LHS.getOperand(1))
7084 0 : return SDValue();
7085 :
7086 2 : if (RCC == ISD::SETUNE) {
7087 : const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
7088 4 : if (!C1 || !C1->isInfinity() || C1->isNegative())
7089 0 : return SDValue();
7090 :
7091 : const uint32_t Mask = SIInstrFlags::N_NORMAL |
7092 : SIInstrFlags::N_SUBNORMAL |
7093 : SIInstrFlags::N_ZERO |
7094 : SIInstrFlags::P_ZERO |
7095 : SIInstrFlags::P_SUBNORMAL |
7096 : SIInstrFlags::P_NORMAL;
7097 :
7098 : static_assert(((~(SIInstrFlags::S_NAN |
7099 : SIInstrFlags::Q_NAN |
7100 : SIInstrFlags::N_INFINITY |
7101 : SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
7102 : "mask not equal");
7103 :
7104 : SDLoc DL(N);
7105 : return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7106 2 : X, DAG.getConstant(Mask, DL, MVT::i32));
7107 : }
7108 : }
7109 : }
7110 :
7111 33234 : if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
7112 : std::swap(LHS, RHS);
7113 :
7114 33243 : if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7115 : RHS.hasOneUse()) {
7116 8 : ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7117 : // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
7118 : // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
7119 : const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7120 8 : if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
7121 8 : (RHS.getOperand(0) == LHS.getOperand(0) &&
7122 8 : LHS.getOperand(0) == LHS.getOperand(1))) {
7123 : const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
7124 : unsigned NewMask = LCC == ISD::SETO ?
7125 5 : Mask->getZExtValue() & ~OrdMask :
7126 12 : Mask->getZExtValue() & OrdMask;
7127 :
7128 : SDLoc DL(N);
7129 : return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
7130 6 : DAG.getConstant(NewMask, DL, MVT::i32));
7131 : }
7132 : }
7133 :
7134 29620 : if (VT == MVT::i32 &&
7135 29619 : (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
7136 : // and x, (sext cc from i1) => select cc, x, 0
7137 24 : if (RHS.getOpcode() != ISD::SIGN_EXTEND)
7138 : std::swap(LHS, RHS);
7139 24 : if (isBoolSGPR(RHS.getOperand(0)))
7140 16 : return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
7141 48 : LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
7142 : }
7143 :
7144 : // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
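  // A worked example of this combine (illustrative; values follow the mask
  // encoding in getPermuteMask and the selection logic below):
  //   (and (or x, 0xffffff00), (or y, 0x000000ff))
  // has per-operand masks 0xffffff00 and 0x030201ff, which use disjoint byte
  // lanes, and combines to (perm y, x, 0x07060500): bytes 3-1 taken from y,
  // byte 0 taken from x.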
7145 33212 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7146 47166 : if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7147 1837 : N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7148 324 : uint32_t LHSMask = getPermuteMask(DAG, LHS);
7149 324 : uint32_t RHSMask = getPermuteMask(DAG, RHS);
7150 324 : if (LHSMask != ~0u && RHSMask != ~0u) {
7151 : // Canonicalize the expression in an attempt to have fewer unique masks
7152 : // and therefore fewer registers used to hold the masks.
7153 1 : if (LHSMask > RHSMask) {
7154 : std::swap(LHSMask, RHSMask);
7155 : std::swap(LHS, RHS);
7156 : }
7157 :
7158 : // Select 0xc for each lane used from source operand. Zero has 0xc mask
7159 :       // set, 0xff bytes have 0xff in the mask, and actual lanes are in the 0-3 range.
7160 1 : uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7161 1 : uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7162 :
7163 :       // Check if we need to combine values from two sources within a byte.
7164 1 : if (!(LHSUsedLanes & RHSUsedLanes) &&
7165 : // If we select high and lower word keep it for SDWA.
7166 : // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7167 1 : !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7168 :           // Each byte of each mask is either a selector value 0-3, or has higher
7169 :           // bits set: 0xff to select the constant 0xff, or 0x0c to select zero. If
7170 :           // either mask selects zero for a byte, the result byte must be 0x0c;
7171 :           // otherwise the mask that is not 0xff wins. ANDing the two masks gives the
7172 :           // correct result except that 0x0c bytes must be forced back to exactly 0x0c.
7173 1 : uint32_t Mask = LHSMask & RHSMask;
7174 5 : for (unsigned I = 0; I < 32; I += 8) {
7175 4 : uint32_t ByteSel = 0xff << I;
7176 4 : if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
7177 0 : Mask &= (0x0c << I) & 0xffffffff;
7178 : }
7179 :
7180 : // Add 4 to each active LHS lane. It will not affect any existing 0xff
7181 : // or 0x0c.
7182 1 : uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
7183 : SDLoc DL(N);
7184 :
7185 : return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7186 : LHS.getOperand(0), RHS.getOperand(0),
7187 1 : DAG.getConstant(Sel, DL, MVT::i32));
7188 : }
7189 : }
7190 : }
7191 :
7192 33211 : return SDValue();
7193 : }
7194 :
7195 15730 : SDValue SITargetLowering::performOrCombine(SDNode *N,
7196 : DAGCombinerInfo &DCI) const {
7197 15730 : SelectionDAG &DAG = DCI.DAG;
7198 15730 : SDValue LHS = N->getOperand(0);
7199 15730 : SDValue RHS = N->getOperand(1);
7200 :
7201 15730 : EVT VT = N->getValueType(0);
7202 : if (VT == MVT::i1) {
7203 : // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
7204 125 : if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7205 : RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
7206 14 : SDValue Src = LHS.getOperand(0);
7207 : if (Src != RHS.getOperand(0))
7208 1 : return SDValue();
7209 :
7210 : const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
7211 : const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7212 13 : if (!CLHS || !CRHS)
7213 0 : return SDValue();
7214 :
7215 : // Only 10 bits are used.
7216 : static const uint32_t MaxMask = 0x3ff;
7217 :
7218 39 : uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
7219 : SDLoc DL(N);
7220 : return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7221 13 : Src, DAG.getConstant(NewMask, DL, MVT::i32));
7222 : }
7223 :
7224 111 : return SDValue();
7225 : }
7226 :
7227 : // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7228 5010 : if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
7229 : LHS.getOpcode() == AMDGPUISD::PERM &&
7230 : isa<ConstantSDNode>(LHS.getOperand(2))) {
7231 3 : uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
7232 3 : if (!Sel)
7233 0 : return SDValue();
7234 :
7235 3 : Sel |= LHS.getConstantOperandVal(2);
7236 : SDLoc DL(N);
7237 : return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7238 3 : LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7239 : }
7240 :
7241 : // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
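  // A worked example of this combine (illustrative; masks per getPermuteMask):
  //   (or (and x, 0x000000ff), (shl y, 24))
  // has per-operand masks 0x0c0c0c00 and 0x000c0c0c, which use disjoint byte
  // lanes, and combines to (perm y, x, 0x040c0c00): byte 3 taken from y's
  // byte 0, byte 0 taken from x, and the middle bytes zero.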
7242 15602 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7243 21065 : if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7244 7749 : N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7245 1087 : uint32_t LHSMask = getPermuteMask(DAG, LHS);
7246 1087 : uint32_t RHSMask = getPermuteMask(DAG, RHS);
7247 1087 : if (LHSMask != ~0u && RHSMask != ~0u) {
7248 : // Canonicalize the expression in an attempt to have fewer unique masks
7249 : // and therefore fewer registers used to hold the masks.
7250 141 : if (LHSMask > RHSMask) {
7251 : std::swap(LHSMask, RHSMask);
7252 : std::swap(LHS, RHS);
7253 : }
7254 :
7255 : // Select 0xc for each lane used from source operand. Zero has 0xc mask
7256 :       // set, 0xff bytes have 0xff in the mask, and actual lanes are in the 0-3 range.
7257 141 : uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7258 141 : uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7259 :
7260 : // Check of we need to combine values from two sources within a byte.
7261 141 : if (!(LHSUsedLanes & RHSUsedLanes) &&
7262 : // If we select high and lower word keep it for SDWA.
7263 : // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7264 141 : !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7265 : // Kill zero bytes selected by other mask. Zero value is 0xc.
7266 12 : LHSMask &= ~RHSUsedLanes;
7267 12 : RHSMask &= ~LHSUsedLanes;
7268 : // Add 4 to each active LHS lane
7269 12 : LHSMask |= LHSUsedLanes & 0x04040404;
7270 : // Combine masks
7271 12 : uint32_t Sel = LHSMask | RHSMask;
7272 : SDLoc DL(N);
7273 :
7274 : return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7275 : LHS.getOperand(0), RHS.getOperand(0),
7276 12 : DAG.getConstant(Sel, DL, MVT::i32));
7277 : }
7278 : }
7279 : }
7280 :
7281 : if (VT != MVT::i64)
7282 13382 : return SDValue();
7283 :
7284 : // TODO: This could be a generic combine with a predicate for extracting the
7285 : // high half of an integer being free.
7286 :
7287 : // (or i64:x, (zero_extend i32:y)) ->
7288 : // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
7289 2208 : if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
7290 : RHS.getOpcode() != ISD::ZERO_EXTEND)
7291 : std::swap(LHS, RHS);
7292 :
7293 2208 : if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
7294 1277 : SDValue ExtSrc = RHS.getOperand(0);
7295 : EVT SrcVT = ExtSrc.getValueType();
7296 : if (SrcVT == MVT::i32) {
7297 : SDLoc SL(N);
7298 : SDValue LowLHS, HiBits;
7299 1277 : std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
7300 1277 : SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
7301 :
7302 1277 : DCI.AddToWorklist(LowOr.getNode());
7303 1277 : DCI.AddToWorklist(HiBits.getNode());
7304 :
7305 : SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
7306 1277 : LowOr, HiBits);
7307 1277 : return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7308 : }
7309 : }
7310 :
7311 931 : const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
7312 : if (CRHS) {
7313 81 : if (SDValue Split
7314 81 : = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
7315 77 : return Split;
7316 : }
7317 :
7318 854 : return SDValue();
7319 : }
7320 :
7321 1928 : SDValue SITargetLowering::performXorCombine(SDNode *N,
7322 : DAGCombinerInfo &DCI) const {
7323 1928 : EVT VT = N->getValueType(0);
7324 : if (VT != MVT::i64)
7325 1331 : return SDValue();
7326 :
7327 597 : SDValue LHS = N->getOperand(0);
7328 597 : SDValue RHS = N->getOperand(1);
7329 :
7330 : const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7331 : if (CRHS) {
7332 309 : if (SDValue Split
7333 309 : = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
7334 26 : return Split;
7335 : }
7336 :
7337 571 : return SDValue();
7338 : }
7339 :
7340 : // Instructions that will be lowered with a final instruction that zeros the
7341 : // high result bits.
7342 : // XXX - probably only need to list legal operations.
7343 324 : static bool fp16SrcZerosHighBits(unsigned Opc) {
7344 324 : switch (Opc) {
7345 : case ISD::FADD:
7346 : case ISD::FSUB:
7347 : case ISD::FMUL:
7348 : case ISD::FDIV:
7349 : case ISD::FREM:
7350 : case ISD::FMA:
7351 : case ISD::FMAD:
7352 : case ISD::FCANONICALIZE:
7353 : case ISD::FP_ROUND:
7354 : case ISD::UINT_TO_FP:
7355 : case ISD::SINT_TO_FP:
7356 : case ISD::FABS:
7357 : // Fabs is lowered to a bit operation, but it's an and which will clear the
7358 : // high bits anyway.
7359 : case ISD::FSQRT:
7360 : case ISD::FSIN:
7361 : case ISD::FCOS:
7362 : case ISD::FPOWI:
7363 : case ISD::FPOW:
7364 : case ISD::FLOG:
7365 : case ISD::FLOG2:
7366 : case ISD::FLOG10:
7367 : case ISD::FEXP:
7368 : case ISD::FEXP2:
7369 : case ISD::FCEIL:
7370 : case ISD::FTRUNC:
7371 : case ISD::FRINT:
7372 : case ISD::FNEARBYINT:
7373 : case ISD::FROUND:
7374 : case ISD::FFLOOR:
7375 : case ISD::FMINNUM:
7376 : case ISD::FMAXNUM:
7377 : case AMDGPUISD::FRACT:
7378 : case AMDGPUISD::CLAMP:
7379 : case AMDGPUISD::COS_HW:
7380 : case AMDGPUISD::SIN_HW:
7381 : case AMDGPUISD::FMIN3:
7382 : case AMDGPUISD::FMAX3:
7383 : case AMDGPUISD::FMED3:
7384 : case AMDGPUISD::FMAD_FTZ:
7385 : case AMDGPUISD::RCP:
7386 : case AMDGPUISD::RSQ:
7387 : case AMDGPUISD::RCP_IFLAG:
7388 : case AMDGPUISD::LDEXP:
7389 : return true;
7390 57 : default:
7391 : // fcopysign, select and others may be lowered to 32-bit bit operations
7392 : // which don't zero the high bits.
7393 57 : return false;
7394 : }
7395 : }
7396 :
7397 21073 : SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
7398 : DAGCombinerInfo &DCI) const {
7399 21073 : if (!Subtarget->has16BitInsts() ||
7400 15844 : DCI.getDAGCombineLevel() < AfterLegalizeDAG)
7401 17571 : return SDValue();
7402 :
7403 7004 : EVT VT = N->getValueType(0);
7404 : if (VT != MVT::i32)
7405 1770 : return SDValue();
7406 :
7407 1732 : SDValue Src = N->getOperand(0);
7408 : if (Src.getValueType() != MVT::i16)
7409 225 : return SDValue();
7410 :
7411 : // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
7412 : // FIXME: It is not universally true that the high bits are zeroed on gfx9.
7413 1507 : if (Src.getOpcode() == ISD::BITCAST) {
7414 324 : SDValue BCSrc = Src.getOperand(0);
7415 324 : if (BCSrc.getValueType() == MVT::f16 &&
7416 324 : fp16SrcZerosHighBits(BCSrc.getOpcode()))
7417 534 : return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
7418 : }
7419 :
7420 1240 : return SDValue();
7421 : }
7422 :
7423 102 : SDValue SITargetLowering::performClassCombine(SDNode *N,
7424 : DAGCombinerInfo &DCI) const {
7425 102 : SelectionDAG &DAG = DCI.DAG;
7426 102 : SDValue Mask = N->getOperand(1);
7427 :
7428 : // fp_class x, 0 -> false
7429 : if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
7430 168 : if (CMask->isNullValue())
7431 4 : return DAG.getConstant(0, SDLoc(N), MVT::i1);
7432 : }
7433 :
7434 200 : if (N->getOperand(0).isUndef())
7435 2 : return DAG.getUNDEF(MVT::i1);
7436 :
7437 98 : return SDValue();
7438 : }
7439 :
7440 763 : SDValue SITargetLowering::performRcpCombine(SDNode *N,
7441 : DAGCombinerInfo &DCI) const {
7442 763 : EVT VT = N->getValueType(0);
7443 763 : SDValue N0 = N->getOperand(0);
7444 :
7445 763 : if (N0.isUndef())
7446 1 : return N0;
7447 :
7448 666 : if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
7449 : N0.getOpcode() == ISD::SINT_TO_FP)) {
7450 410 : return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
7451 205 : N->getFlags());
7452 : }
7453 :
7454 557 : return AMDGPUTargetLowering::performRcpCombine(N, DCI);
7455 : }
7456 :
7457 1366 : bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
7458 : unsigned MaxDepth) const {
7459 : unsigned Opcode = Op.getOpcode();
7460 1366 : if (Opcode == ISD::FCANONICALIZE)
7461 : return true;
7462 :
7463 : if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
7464 11 : auto F = CFP->getValueAPF();
7465 11 : if (F.isNaN() && F.isSignaling())
7466 : return false;
7467 11 : return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
7468 : }
7469 :
7470 : // If source is a result of another standard FP operation it is already in
7471 : // canonical form.
7472 1355 : if (MaxDepth == 0)
7473 : return false;
7474 :
7475 1355 : switch (Opcode) {
7476 : // These will flush denorms if required.
7477 : case ISD::FADD:
7478 : case ISD::FSUB:
7479 : case ISD::FMUL:
7480 : case ISD::FCEIL:
7481 : case ISD::FFLOOR:
7482 : case ISD::FMA:
7483 : case ISD::FMAD:
7484 : case ISD::FSQRT:
7485 : case ISD::FDIV:
7486 : case ISD::FREM:
7487 : case ISD::FP_ROUND:
7488 : case ISD::FP_EXTEND:
7489 : case AMDGPUISD::FMUL_LEGACY:
7490 : case AMDGPUISD::FMAD_FTZ:
7491 : case AMDGPUISD::RCP:
7492 : case AMDGPUISD::RSQ:
7493 : case AMDGPUISD::RSQ_CLAMP:
7494 : case AMDGPUISD::RCP_LEGACY:
7495 : case AMDGPUISD::RSQ_LEGACY:
7496 : case AMDGPUISD::RCP_IFLAG:
7497 : case AMDGPUISD::TRIG_PREOP:
7498 : case AMDGPUISD::DIV_SCALE:
7499 : case AMDGPUISD::DIV_FMAS:
7500 : case AMDGPUISD::DIV_FIXUP:
7501 : case AMDGPUISD::FRACT:
7502 : case AMDGPUISD::LDEXP:
7503 : case AMDGPUISD::CVT_PKRTZ_F16_F32:
7504 : case AMDGPUISD::CVT_F32_UBYTE0:
7505 : case AMDGPUISD::CVT_F32_UBYTE1:
7506 : case AMDGPUISD::CVT_F32_UBYTE2:
7507 : case AMDGPUISD::CVT_F32_UBYTE3:
7508 : return true;
7509 :
7510 : // It can/will be lowered or combined as a bit operation.
7511 : // Need to check their input recursively to handle.
7512 146 : case ISD::FNEG:
7513 : case ISD::FABS:
7514 : case ISD::FCOPYSIGN:
7515 292 : return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
7516 :
7517 : case ISD::FSIN:
7518 : case ISD::FCOS:
7519 : case ISD::FSINCOS:
7520 30 : return Op.getValueType().getScalarType() != MVT::f16;
7521 :
7522 67 : case ISD::FMINNUM:
7523 : case ISD::FMAXNUM:
7524 : case AMDGPUISD::CLAMP:
7525 : case AMDGPUISD::FMED3:
7526 : case AMDGPUISD::FMAX3:
7527 : case AMDGPUISD::FMIN3: {
7528 : // FIXME: Shouldn't treat the generic operations different based these.
7529 67 : bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
7530 67 : if (IsIEEEMode) {
7531 : // snans will be quieted, so we only need to worry about denormals.
7532 84 : if (Subtarget->supportsMinMaxDenormModes() ||
7533 34 : denormalsEnabledForType(Op.getValueType()))
7534 26 : return true;
7535 :
7536 : // Flushing may be required.
7537 : // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
7538 : // targets need to check their input recursively.
7539 52 : return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
7540 4 : isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
7541 : }
7542 :
7543 28 : if (Subtarget->supportsMinMaxDenormModes() ||
7544 11 : denormalsEnabledForType(Op.getValueType())) {
7545 : // Only quieting may be necessary.
7546 9 : return DAG.isKnownNeverSNaN(Op.getOperand(0)) &&
7547 3 : DAG.isKnownNeverSNaN(Op.getOperand(1));
7548 : }
7549 :
7550 : // Flushing and quieting may be necessary
7551 : // With ieee_mode off, the nan is returned as-is, so if it is an sNaN it
7552 : // needs to be quieted.
7553 16 : return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
7554 0 : isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
7555 : }
7556 5 : case ISD::SELECT: {
7557 15 : return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
7558 5 : isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
7559 : }
7560 63 : case ISD::BUILD_VECTOR: {
7561 110 : for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
7562 94 : SDValue SrcOp = Op.getOperand(i);
7563 94 : if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
7564 47 : return false;
7565 : }
7566 :
7567 : return true;
7568 : }
7569 72 : case ISD::EXTRACT_VECTOR_ELT:
7570 : case ISD::EXTRACT_SUBVECTOR: {
7571 144 : return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
7572 : }
7573 86 : case ISD::INSERT_VECTOR_ELT: {
7574 188 : return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
7575 16 : isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
7576 : }
7577 38 : case ISD::UNDEF:
7578 : // Could be anything.
7579 38 : return false;
7580 :
7581 25 : case ISD::INTRINSIC_WO_CHAIN: {
7582 : unsigned IntrinsicID
7583 50 : = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7584 : // TODO: Handle more intrinsics
7585 : switch (IntrinsicID) {
7586 : case Intrinsic::amdgcn_cvt_pkrtz:
7587 : case Intrinsic::amdgcn_cubeid:
7588 : case Intrinsic::amdgcn_frexp_mant:
7589 : case Intrinsic::amdgcn_fdot2:
7590 : return true;
7591 : default:
7592 : break;
7593 : }
7594 :
7595 : LLVM_FALLTHROUGH;
7596 : }
7597 : default:
7598 643 : return denormalsEnabledForType(Op.getValueType()) &&
7599 364 : DAG.isKnownNeverSNaN(Op);
7600 : }
7601 :
7602 : llvm_unreachable("invalid operation");
7603 : }
7604 :
7605 : // Constant fold canonicalize.
7606 :
7607 149 : SDValue SITargetLowering::getCanonicalConstantFP(
7608 : SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
7609 : // Flush denormals to 0 if not enabled.
7610 149 : if (C.isDenormal() && !denormalsEnabledForType(VT))
7611 4 : return DAG.getConstantFP(0.0, SL, VT);
7612 :
7613 145 : if (C.isNaN()) {
7614 : APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
7615 55 : if (C.isSignaling()) {
7616 : // Quiet a signaling NaN.
7617 : // FIXME: Is this supposed to preserve payload bits?
7618 30 : return DAG.getConstantFP(CanonicalQNaN, SL, VT);
7619 : }
7620 :
7621 : // Make sure it is the canonical NaN bitpattern.
7622 : //
7623 : // TODO: Can we use -1 as the canonical NaN value since it's an inline
7624 : // immediate?
7625 50 : if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
7626 18 : return DAG.getConstantFP(CanonicalQNaN, SL, VT);
7627 : }
7628 :
7629 : // Already canonical.
7630 97 : return DAG.getConstantFP(C, SL, VT);
7631 : }
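// Illustrative f32 results of the folding above (bit patterns assume the
// default APFloat canonical quiet NaN):
//   sNaN 0x7f800001      -> qNaN 0x7fc00000
//   denormal 0x00000001  -> +0.0, unless f32 denormals are enabled
//   qNaN 0x7fc00001      -> qNaN 0x7fc00000 (payload not preserved)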
7632 :
7633 0 : static bool vectorEltWillFoldAway(SDValue Op) {
7634 64 : return Op.isUndef() || isa<ConstantFPSDNode>(Op);
7635 : }
7636 :
7637 1086 : SDValue SITargetLowering::performFCanonicalizeCombine(
7638 : SDNode *N,
7639 : DAGCombinerInfo &DCI) const {
7640 1086 : SelectionDAG &DAG = DCI.DAG;
7641 1086 : SDValue N0 = N->getOperand(0);
7642 2172 : EVT VT = N->getValueType(0);
7643 :
7644 : // fcanonicalize undef -> qnan
7645 1086 : if (N0.isUndef()) {
7646 23 : APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
7647 46 : return DAG.getConstantFP(QNaN, SDLoc(N), VT);
7648 : }
7649 :
7650 1063 : if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
7651 137 : EVT VT = N->getValueType(0);
7652 274 : return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
7653 : }
7654 :
7655 : // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
7656 : // (fcanonicalize k)
7657 : //
7658 : // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
7659 :
7660 : // TODO: This could be better with wider vectors that will be split to v2f16,
7661 : // and to consider uses since there aren't that many packed operations.
7662 926 : if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
7663 : isTypeLegal(MVT::v2f16)) {
7664 : SDLoc SL(N);
7665 39 : SDValue NewElts[2];
7666 39 : SDValue Lo = N0.getOperand(0);
7667 39 : SDValue Hi = N0.getOperand(1);
7668 39 : EVT EltVT = Lo.getValueType();
7669 :
7670 : if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
7671 60 : for (unsigned I = 0; I != 2; ++I) {
7672 40 : SDValue Op = N0.getOperand(I);
7673 : if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
7674 12 : NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
7675 24 : CFP->getValueAPF());
7676 28 : } else if (Op.isUndef()) {
7677 : // Handled below based on what the other operand is.
7678 18 : NewElts[I] = Op;
7679 : } else {
7680 10 : NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
7681 : }
7682 : }
7683 :
7684 :       // If one half is undef, and one is constant, prefer a splat vector rather
7685 : // than the normal qNaN. If it's a register, prefer 0.0 since that's
7686 : // cheaper to use and may be free with a packed operation.
7687 40 : if (NewElts[0].isUndef()) {
7688 : if (isa<ConstantFPSDNode>(NewElts[1]))
7689 4 : NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
7690 0 : NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
7691 : }
7692 :
7693 40 : if (NewElts[1].isUndef()) {
7694 10 : NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
7695 6 : NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
7696 : }
7697 :
7698 20 : return DAG.getBuildVector(VT, SL, NewElts);
7699 : }
7700 : }
7701 :
7702 906 : return isCanonicalized(DAG, N0) ? N0 : SDValue();
7703 : }
7704 :
7705 : static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
7706 61 : switch (Opc) {
7707 : case ISD::FMAXNUM:
7708 : return AMDGPUISD::FMAX3;
7709 5 : case ISD::SMAX:
7710 : return AMDGPUISD::SMAX3;
7711 5 : case ISD::UMAX:
7712 : return AMDGPUISD::UMAX3;
7713 13 : case ISD::FMINNUM:
7714 : return AMDGPUISD::FMIN3;
7715 15 : case ISD::SMIN:
7716 : return AMDGPUISD::SMIN3;
7717 8 : case ISD::UMIN:
7718 : return AMDGPUISD::UMIN3;
7719 0 : default:
7720 0 : llvm_unreachable("Not a min/max opcode");
7721 : }
7722 : }
7723 :
7724 153 : SDValue SITargetLowering::performIntMed3ImmCombine(
7725 : SelectionDAG &DAG, const SDLoc &SL,
7726 : SDValue Op0, SDValue Op1, bool Signed) const {
7727 : ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
7728 : if (!K1)
7729 90 : return SDValue();
7730 :
7731 : ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
7732 : if (!K0)
7733 3 : return SDValue();
7734 :
7735 60 : if (Signed) {
7736 153 : if (K0->getAPIntValue().sge(K1->getAPIntValue()))
7737 3 : return SDValue();
7738 : } else {
7739 27 : if (K0->getAPIntValue().uge(K1->getAPIntValue()))
7740 3 : return SDValue();
7741 : }
7742 :
7743 54 : EVT VT = K0->getValueType(0);
7744 54 : unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
7745 12 : if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
7746 : return DAG.getNode(Med3Opc, SL, VT,
7747 50 : Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
7748 : }
7749 :
7750 : // If there isn't a 16-bit med3 operation, convert to 32-bit.
7751 : MVT NVT = MVT::i32;
7752 4 : unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7753 :
7754 4 : SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
7755 8 : SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
7756 4 : SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
7757 :
7758 4 : SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
7759 4 : return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
7760 : }
7761 :
7762 0 : static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
7763 : if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
7764 0 : return C;
7765 :
7766 : if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7767 0 : if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
7768 0 : return C;
7769 : }
7770 :
7771 : return nullptr;
7772 : }
7773 :
7774 431 : SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
7775 : const SDLoc &SL,
7776 : SDValue Op0,
7777 : SDValue Op1) const {
7778 431 : ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
7779 431 : if (!K1)
7780 70 : return SDValue();
7781 :
7782 361 : ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
7783 361 : if (!K0)
7784 7 : return SDValue();
7785 :
7786 : // Ordered >= (although NaN inputs should have folded away by now).
7787 1062 : APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
7788 354 : if (Cmp == APFloat::cmpGreaterThan)
7789 5 : return SDValue();
7790 :
7791 : // TODO: Check IEEE bit enabled?
7792 349 : EVT VT = Op0.getValueType();
7793 349 : if (Subtarget->enableDX10Clamp()) {
7794 : // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
7795 : // hardware fmed3 behavior converting to a min.
7796 : // FIXME: Should this be allowing -0.0?
7797 340 : if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
7798 257 : return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
7799 : }
7800 :
7801 : // med3 for f16 is only available on gfx9+, and not available for v2f16.
7802 6 : if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
7803 : // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
7804 : // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
7805 : // then give the other result, which is different from med3 with a NaN
7806 : // input.
7807 87 : SDValue Var = Op0.getOperand(0);
7808 87 : if (!DAG.isKnownNeverSNaN(Var))
7809 16 : return SDValue();
7810 :
7811 71 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7812 :
7813 68 : if ((!K0->hasOneUse() ||
7814 207 : TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
7815 56 : (!K1->hasOneUse() ||
7816 239 : TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
7817 : return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
7818 130 : Var, SDValue(K0, 0), SDValue(K1, 0));
7819 : }
7820 : }
7821 :
7822 11 : return SDValue();
7823 : }
7824 :
7825 3554 : SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
7826 : DAGCombinerInfo &DCI) const {
7827 3554 : SelectionDAG &DAG = DCI.DAG;
7828 :
7829 3554 : EVT VT = N->getValueType(0);
7830 3554 : unsigned Opc = N->getOpcode();
7831 3554 : SDValue Op0 = N->getOperand(0);
7832 3554 : SDValue Op1 = N->getOperand(1);
7833 :
7834 :   // Only do this if the inner op has one use since this will just increase
7835 : // register pressure for no benefit.
7836 :
7837 :
7838 3466 : if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
7839 3554 : !VT.isVector() && VT != MVT::f64 &&
7840 712 : ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
7841 : // max(max(a, b), c) -> max3(a, b, c)
7842 : // min(min(a, b), c) -> min3(a, b, c)
7843 2923 : if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
7844 : SDLoc DL(N);
7845 : return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
7846 : DL,
7847 : N->getValueType(0),
7848 : Op0.getOperand(0),
7849 : Op0.getOperand(1),
7850 72 : Op1);
7851 : }
7852 :
7853 : // Try commuted.
7854 : // max(a, max(b, c)) -> max3(a, b, c)
7855 : // min(a, min(b, c)) -> min3(a, b, c)
7856 2858 : if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
7857 : SDLoc DL(N);
7858 : return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
7859 : DL,
7860 : N->getValueType(0),
7861 : Op0,
7862 : Op1.getOperand(0),
7863 35 : Op1.getOperand(1));
7864 : }
7865 : }
7866 :
7867 : // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
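  // For example (illustrative): (smin (smax x, -3), 7) -> (smed3 x, -3, 7),
  // and similarly for the unsigned and floating-point forms below; the
  // special case (fminnum (fmaxnum x, 0.0), 1.0) becomes a clamp instead
  // when dx10_clamp is enabled.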
7868 3592 : if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
7869 93 : if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
7870 48 : return Med3;
7871 : }
7872 :
7873 3511 : if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
7874 60 : if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
7875 6 : return Med3;
7876 : }
7877 :
7878 : // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
7879 3439 : if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
7880 40 : (Opc == AMDGPUISD::FMIN_LEGACY &&
7881 : Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
7882 : (VT == MVT::f32 || VT == MVT::f64 ||
7883 73 : (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
7884 3904 : (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
7885 : Op0.hasOneUse()) {
7886 431 : if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
7887 322 : return Res;
7888 : }
7889 :
7890 3117 : return SDValue();
7891 : }
7892 :
7893 0 : static bool isClampZeroToOne(SDValue A, SDValue B) {
7894 : if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
7895 : if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
7896 : // FIXME: Should this be allowing -0.0?
7897 0 : return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
7898 0 : (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
7899 : }
7900 : }
7901 :
7902 : return false;
7903 : }
7904 :
7905 : // FIXME: Should only worry about snans for version with chain.
7906 177 : SDValue SITargetLowering::performFMed3Combine(SDNode *N,
7907 : DAGCombinerInfo &DCI) const {
7908 177 : EVT VT = N->getValueType(0);
7909 : // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
7910 : // NaNs. With a NaN input, the order of the operands may change the result.
7911 :
7912 177 : SelectionDAG &DAG = DCI.DAG;
7913 : SDLoc SL(N);
7914 :
7915 177 : SDValue Src0 = N->getOperand(0);
7916 177 : SDValue Src1 = N->getOperand(1);
7917 177 : SDValue Src2 = N->getOperand(2);
7918 :
7919 177 : if (isClampZeroToOne(Src0, Src1)) {
7920 : // const_a, const_b, x -> clamp is safe in all cases including signaling
7921 : // nans.
7922 : // FIXME: Should this be allowing -0.0?
7923 36 : return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
7924 : }
7925 :
7926 : // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
7927 : // handling no dx10-clamp?
7928 141 : if (Subtarget->enableDX10Clamp()) {
7929 :     // If NaNs are clamped to 0, we are free to reorder the inputs.
7930 :
7931 : if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
7932 : std::swap(Src0, Src1);
7933 :
7934 : if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
7935 : std::swap(Src1, Src2);
7936 :
7937 : if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
7938 : std::swap(Src0, Src1);
7939 :
7940 123 : if (isClampZeroToOne(Src1, Src2))
7941 12 : return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
7942 : }
7943 :
7944 129 : return SDValue();
7945 : }
7946 :
7947 113 : SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
7948 : DAGCombinerInfo &DCI) const {
7949 113 : SDValue Src0 = N->getOperand(0);
7950 113 : SDValue Src1 = N->getOperand(1);
7951 113 : if (Src0.isUndef() && Src1.isUndef())
7952 6 : return DCI.DAG.getUNDEF(N->getValueType(0));
7953 110 : return SDValue();
7954 : }
7955 :
7956 273398 : SDValue SITargetLowering::performExtractVectorEltCombine(
7957 : SDNode *N, DAGCombinerInfo &DCI) const {
7958 273398 : SDValue Vec = N->getOperand(0);
7959 273398 : SelectionDAG &DAG = DCI.DAG;
7960 :
7961 273398 : EVT VecVT = Vec.getValueType();
7962 273398 : EVT EltVT = VecVT.getVectorElementType();
7963 :
7964 273326 : if ((Vec.getOpcode() == ISD::FNEG ||
7965 273436 : Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
7966 : SDLoc SL(N);
7967 78 : EVT EltVT = N->getValueType(0);
7968 78 : SDValue Idx = N->getOperand(1);
7969 : SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7970 78 : Vec.getOperand(0), Idx);
7971 78 : return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
7972 : }
7973 :
7974 : // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
7975 : // =>
7976 : // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
7977 : // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
7978 : // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
7979 273320 : if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
7980 : SDLoc SL(N);
7981 1231 : EVT EltVT = N->getValueType(0);
7982 1231 : SDValue Idx = N->getOperand(1);
7983 : unsigned Opc = Vec.getOpcode();
7984 :
7985 1231 : switch(Opc) {
7986 1189 : default:
7987 1189 : return SDValue();
7988 : // TODO: Support other binary operations.
7989 : case ISD::FADD:
7990 : case ISD::FSUB:
7991 : case ISD::FMUL:
7992 : case ISD::ADD:
7993 : case ISD::UMIN:
7994 : case ISD::UMAX:
7995 : case ISD::SMIN:
7996 : case ISD::SMAX:
7997 : case ISD::FMAXNUM:
7998 : case ISD::FMINNUM: {
7999 : SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8000 42 : Vec.getOperand(0), Idx);
8001 : SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8002 42 : Vec.getOperand(1), Idx);
8003 :
8004 42 : DCI.AddToWorklist(Elt0.getNode());
8005 42 : DCI.AddToWorklist(Elt1.getNode());
8006 42 : return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
8007 : }
8008 : }
8009 : }
8010 :
8011 272089 : if (!DCI.isBeforeLegalize())
8012 228613 : return SDValue();
8013 :
8014 43476 : unsigned VecSize = VecVT.getSizeInBits();
8015 43476 : unsigned EltSize = EltVT.getSizeInBits();
8016 :
8017 : // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
8018 : // elements. This exposes more load reduction opportunities by replacing
8019 : // multiple small extract_vector_elements with a single 32-bit extract.
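  // For example (illustrative): (extract_vector_elt (v8i8 load), 5) is
  // rewritten below as a bitcast to v2i32, an extract of element 1, a shift
  // right by 8, and a truncate back to i8.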
8020 43476 : auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
8021 29633 : if (isa<MemSDNode>(Vec) &&
8022 2065 : EltSize <= 16 &&
8023 1425 : EltVT.isByteSized() &&
8024 326 : VecSize > 32 &&
8025 43802 : VecSize % 32 == 0 &&
8026 : Idx) {
8027 302 : EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
8028 :
8029 302 : unsigned BitIndex = Idx->getZExtValue() * EltSize;
8030 302 : unsigned EltIdx = BitIndex / 32;
8031 302 : unsigned LeftoverBitIdx = BitIndex % 32;
8032 : SDLoc SL(N);
8033 :
8034 302 : SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
8035 302 : DCI.AddToWorklist(Cast.getNode());
8036 :
8037 : SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
8038 302 : DAG.getConstant(EltIdx, SL, MVT::i32));
8039 302 : DCI.AddToWorklist(Elt.getNode());
8040 : SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
8041 302 : DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
8042 302 : DCI.AddToWorklist(Srl.getNode());
8043 :
8044 302 : SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
8045 302 : DCI.AddToWorklist(Trunc.getNode());
8046 302 : return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
8047 : }
8048 :
8049 43174 : return SDValue();
8050 : }
8051 :
8052 3820 : static bool convertBuildVectorCastElt(SelectionDAG &DAG,
8053 : SDValue &Lo, SDValue &Hi) {
8054 3820 : if (Hi.getOpcode() == ISD::BITCAST &&
8055 3820 : Hi.getOperand(0).getValueType() == MVT::f16 &&
8056 4 : (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
8057 2 : Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
8058 2 : Hi = Hi.getOperand(0);
8059 2 : return true;
8060 : }
8061 :
8062 : return false;
8063 : }
8064 :
8065 139608 : SDValue SITargetLowering::performBuildVectorCombine(
8066 : SDNode *N, DAGCombinerInfo &DCI) const {
8067 : SDLoc SL(N);
8068 :
8069 : if (!isTypeLegal(MVT::v2i16))
8070 60105 : return SDValue();
8071 79503 : SelectionDAG &DAG = DCI.DAG;
8072 159006 : EVT VT = N->getValueType(0);
8073 :
8074 : if (VT == MVT::v2i16) {
8075 1911 : SDValue Lo = N->getOperand(0);
8076 1911 : SDValue Hi = N->getOperand(1);
8077 :
8078 : // v2i16 build_vector (const|undef), (bitcast f16:$x)
8079 : // -> bitcast (v2f16 build_vector const|undef, $x
8080 1911 : if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
8081 4 : SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi });
8082 2 : return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
8083 : }
8084 :
8085 1909 : if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
8086 0 : SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo });
8087 0 : return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
8088 : }
8089 : }
8090 :
8091 79501 : return SDValue();
8092 : }
8093 :
8094 200 : unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
8095 : const SDNode *N0,
8096 : const SDNode *N1) const {
8097 400 : EVT VT = N0->getValueType(0);
8098 :
8099 : // Only do this if we are not trying to support denormals. v_mad_f32 does not
8100 : // support denormals ever.
8101 122 : if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
8102 32 : (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
8103 114 : return ISD::FMAD;
8104 :
8105 86 : const TargetOptions &Options = DAG.getTarget().Options;
8106 45 : if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
8107 45 : (N0->getFlags().hasAllowContract() &&
8108 136 : N1->getFlags().hasAllowContract())) &&
8109 44 : isFMAFasterThanFMulAndFAdd(VT)) {
8110 28 : return ISD::FMA;
8111 : }
8112 :
8113 : return 0;
8114 : }
8115 :
8116 24 : static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
8117 : EVT VT,
8118 : SDValue N0, SDValue N1, SDValue N2,
8119 : bool Signed) {
8120 24 : unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
8121 24 : SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
8122 24 : SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
8123 24 : return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
8124 : }
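// Illustrative use in the add combine below: for
//   (add i64 (mul i64 %a, %b), %c)
// where %a and %b are known to fit in 32 unsigned bits, the operands are
// truncated (or zero-extended) to i32 and the whole expression becomes
//   (mad_u64_u32 %a, %b, %c)
// with the signed variant used when the sign-bit analysis applies instead.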
8125 :
8126 120080 : SDValue SITargetLowering::performAddCombine(SDNode *N,
8127 : DAGCombinerInfo &DCI) const {
8128 120080 : SelectionDAG &DAG = DCI.DAG;
8129 240160 : EVT VT = N->getValueType(0);
8130 : SDLoc SL(N);
8131 120080 : SDValue LHS = N->getOperand(0);
8132 120080 : SDValue RHS = N->getOperand(1);
8133 :
8134 119537 : if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
8135 4613 : && Subtarget->hasMad64_32() &&
8136 121438 : !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
8137 : VT.getScalarSizeInBits() <= 64) {
8138 32 : if (LHS.getOpcode() != ISD::MUL)
8139 : std::swap(LHS, RHS);
8140 :
8141 32 : SDValue MulLHS = LHS.getOperand(0);
8142 32 : SDValue MulRHS = LHS.getOperand(1);
8143 32 : SDValue AddRHS = RHS;
8144 :
8145 : // TODO: Maybe restrict if SGPR inputs.
8146 50 : if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
8147 18 : numBitsUnsigned(MulRHS, DAG) <= 32) {
8148 17 : MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
8149 17 : MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
8150 17 : AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
8151 17 : return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
8152 : }
8153 :
8154 15 : if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
8155 7 : MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
8156 7 : MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
8157 7 : AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
8158 7 : return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
8159 : }
8160 :
8161 8 : return SDValue();
8162 : }
8163 :
8164 120048 : if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
8165 105557 : return SDValue();
8166 :
8167 : // add x, zext (setcc) => addcarry x, 0, setcc
8168 : // add x, sext (setcc) => subcarry x, 0, setcc
8169 : unsigned Opc = LHS.getOpcode();
8170 28982 : if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
8171 14491 : Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
8172 : std::swap(RHS, LHS);
8173 :
8174 : Opc = RHS.getOpcode();
8175 14491 : switch (Opc) {
8176 : default: break;
8177 153 : case ISD::ZERO_EXTEND:
8178 : case ISD::SIGN_EXTEND:
8179 : case ISD::ANY_EXTEND: {
8180 153 : auto Cond = RHS.getOperand(0);
8181 : if (!isBoolSGPR(Cond))
8182 : break;
8183 45 : SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
8184 90 : SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
8185 45 : Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
8186 45 : return DAG.getNode(Opc, SL, VTList, Args);
8187 : }
8188 0 : case ISD::ADDCARRY: {
8189 : // add x, (addcarry y, 0, cc) => addcarry x, y, cc
8190 : auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
8191 0 : if (!C || C->getZExtValue() != 0) break;
8192 0 : SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
8193 0 : return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
8194 : }
8195 : }
8196 14446 : return SDValue();
8197 : }
8198 :
8199 3805 : SDValue SITargetLowering::performSubCombine(SDNode *N,
8200 : DAGCombinerInfo &DCI) const {
8201 3805 : SelectionDAG &DAG = DCI.DAG;
8202 3805 : EVT VT = N->getValueType(0);
8203 :
8204 : if (VT != MVT::i32)
8205 624 : return SDValue();
8206 :
8207 : SDLoc SL(N);
8208 3181 : SDValue LHS = N->getOperand(0);
8209 3181 : SDValue RHS = N->getOperand(1);
8210 :
8211 : unsigned Opc = LHS.getOpcode();
8212 3181 : if (Opc != ISD::SUBCARRY)
8213 : std::swap(RHS, LHS);
8214 :
8215 3181 : if (LHS.getOpcode() == ISD::SUBCARRY) {
8216 : // sub (subcarry x, 0, cc), y => subcarry x, y, cc
8217 : auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
8218 4 : if (!C || C->getZExtValue() != 0)
8219 0 : return SDValue();
8220 2 : SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
8221 4 : return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
8222 : }
8223 3179 : return SDValue();
8224 : }
8225 :
8226 680 : SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
8227 : DAGCombinerInfo &DCI) const {
8228 :
8229 680 : if (N->getValueType(0) != MVT::i32)
8230 0 : return SDValue();
8231 :
8232 680 : auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8233 514 : if (!C || C->getZExtValue() != 0)
8234 423 : return SDValue();
8235 :
8236 257 : SelectionDAG &DAG = DCI.DAG;
8237 257 : SDValue LHS = N->getOperand(0);
8238 :
8239 : // addcarry (add x, y), 0, cc => addcarry x, y, cc
8240 : // subcarry (sub x, y), 0, cc => subcarry x, y, cc
8241 : unsigned LHSOpc = LHS.getOpcode();
8242 257 : unsigned Opc = N->getOpcode();
8243 257 : if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
8244 257 : (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
8245 2 : SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
8246 4 : return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
8247 : }
8248 255 : return SDValue();
8249 : }
8250 :
8251 8302 : SDValue SITargetLowering::performFAddCombine(SDNode *N,
8252 : DAGCombinerInfo &DCI) const {
8253 8302 : if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8254 5946 : return SDValue();
8255 :
8256 2356 : SelectionDAG &DAG = DCI.DAG;
8257 4712 : EVT VT = N->getValueType(0);
8258 :
8259 : SDLoc SL(N);
8260 2356 : SDValue LHS = N->getOperand(0);
8261 2356 : SDValue RHS = N->getOperand(1);
8262 :
8263 : // These should really be instruction patterns, but writing patterns with
8264 :   // source modifiers is a pain.
8265 :
8266 : // fadd (fadd (a, a), b) -> mad 2.0, a, b
8267 2356 : if (LHS.getOpcode() == ISD::FADD) {
8268 303 : SDValue A = LHS.getOperand(0);
8269 303 : if (A == LHS.getOperand(1)) {
8270 105 : unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
8271 105 : if (FusedOp != 0) {
8272 73 : const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8273 73 : return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
8274 : }
8275 : }
8276 : }
8277 :
8278 : // fadd (b, fadd (a, a)) -> mad 2.0, a, b
8279 2283 : if (RHS.getOpcode() == ISD::FADD) {
8280 147 : SDValue A = RHS.getOperand(0);
8281 147 : if (A == RHS.getOperand(1)) {
8282 30 : unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
8283 30 : if (FusedOp != 0) {
8284 20 : const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8285 20 : return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
8286 : }
8287 : }
8288 : }
8289 :
8290 2263 : return SDValue();
8291 : }
8292 :
8293 1634 : SDValue SITargetLowering::performFSubCombine(SDNode *N,
8294 : DAGCombinerInfo &DCI) const {
8295 1634 : if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8296 1089 : return SDValue();
8297 :
8298 545 : SelectionDAG &DAG = DCI.DAG;
8299 : SDLoc SL(N);
8300 545 : EVT VT = N->getValueType(0);
8301 : assert(!VT.isVector());
8302 :
8303 : // Try to get the fneg to fold into the source modifier. This undoes generic
8304 : // DAG combines and folds them into the mad.
8305 : //
8306 : // Only do this if we are not trying to support denormals. v_mad_f32 does
8307 : // not support denormals ever.
8308 545 : SDValue LHS = N->getOperand(0);
8309 545 : SDValue RHS = N->getOperand(1);
8310 545 : if (LHS.getOpcode() == ISD::FADD) {
8311 : // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
8312 46 : SDValue A = LHS.getOperand(0);
8313 46 : if (A == LHS.getOperand(1)) {
8314 24 : unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
8315 24 : if (FusedOp != 0){
8316 17 : const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8317 17 : SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
8318 :
8319 17 : return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
8320 : }
8321 : }
8322 : }
8323 :
8324 528 : if (RHS.getOpcode() == ISD::FADD) {
8325 : // (fsub c, (fadd a, a)) -> mad -2.0, a, c
8326 :
8327 50 : SDValue A = RHS.getOperand(0);
8328 50 : if (A == RHS.getOperand(1)) {
8329 41 : unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
8330 41 : if (FusedOp != 0){
8331 32 : const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
8332 32 : return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
8333 : }
8334 : }
8335 : }
8336 :
8337 496 : return SDValue();
8338 : }
8339 :
8340 1833 : SDValue SITargetLowering::performFMACombine(SDNode *N,
8341 : DAGCombinerInfo &DCI) const {
8342 1833 : SelectionDAG &DAG = DCI.DAG;
8343 1833 : EVT VT = N->getValueType(0);
8344 : SDLoc SL(N);
8345 :
8346 1833 : if (!Subtarget->hasDLInsts() || VT != MVT::f32)
8347 1608 : return SDValue();
8348 :
8349 :   // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
8350 : // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
8351 225 : SDValue Op1 = N->getOperand(0);
8352 225 : SDValue Op2 = N->getOperand(1);
8353 225 : SDValue FMA = N->getOperand(2);
8354 :
8355 33 : if (FMA.getOpcode() != ISD::FMA ||
8356 258 : Op1.getOpcode() != ISD::FP_EXTEND ||
8357 : Op2.getOpcode() != ISD::FP_EXTEND)
8358 192 : return SDValue();
8359 :
8360 : // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
8361 : // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
8362 :   // is sufficient to allow generating fdot2.
8363 33 : const TargetOptions &Options = DAG.getTarget().Options;
8364 33 : if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
8365 0 : (N->getFlags().hasAllowContract() &&
8366 0 : FMA->getFlags().hasAllowContract())) {
8367 33 : Op1 = Op1.getOperand(0);
8368 33 : Op2 = Op2.getOperand(0);
8369 33 : if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8370 : Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8371 12 : return SDValue();
8372 :
8373 21 : SDValue Vec1 = Op1.getOperand(0);
8374 21 : SDValue Idx1 = Op1.getOperand(1);
8375 21 : SDValue Vec2 = Op2.getOperand(0);
8376 :
8377 21 : SDValue FMAOp1 = FMA.getOperand(0);
8378 21 : SDValue FMAOp2 = FMA.getOperand(1);
8379 21 : SDValue FMAAcc = FMA.getOperand(2);
8380 :
8381 21 : if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
8382 : FMAOp2.getOpcode() != ISD::FP_EXTEND)
8383 0 : return SDValue();
8384 :
8385 21 : FMAOp1 = FMAOp1.getOperand(0);
8386 21 : FMAOp2 = FMAOp2.getOperand(0);
8387 21 : if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8388 : FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8389 0 : return SDValue();
8390 :
8391 21 : SDValue Vec3 = FMAOp1.getOperand(0);
8392 21 : SDValue Vec4 = FMAOp2.getOperand(0);
8393 21 : SDValue Idx2 = FMAOp1.getOperand(1);
8394 :
8395 : if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
8396 : // Idx1 and Idx2 cannot be the same.
8397 : Idx1 == Idx2)
8398 12 : return SDValue();
8399 :
8400 : if (Vec1 == Vec2 || Vec3 == Vec4)
8401 0 : return SDValue();
8402 :
8403 : if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
8404 3 : return SDValue();
8405 :
8406 : if ((Vec1 == Vec3 && Vec2 == Vec4) ||
8407 : (Vec1 == Vec4 && Vec2 == Vec3)) {
8408 : return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
8409 12 : DAG.getTargetConstant(0, SL, MVT::i1));
8410 : }
8411 : }
8412 0 : return SDValue();
8413 : }
8414 :
8415 10765 : SDValue SITargetLowering::performSetCCCombine(SDNode *N,
8416 : DAGCombinerInfo &DCI) const {
8417 10765 : SelectionDAG &DAG = DCI.DAG;
8418 : SDLoc SL(N);
8419 :
8420 10765 : SDValue LHS = N->getOperand(0);
8421 10765 : SDValue RHS = N->getOperand(1);
8422 : EVT VT = LHS.getValueType();
8423 10765 : ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
8424 :
8425 : auto CRHS = dyn_cast<ConstantSDNode>(RHS);
8426 : if (!CRHS) {
8427 : CRHS = dyn_cast<ConstantSDNode>(LHS);
8428 : if (CRHS) {
8429 : std::swap(LHS, RHS);
8430 0 : CC = getSetCCSwappedOperands(CC);
8431 : }
8432 : }
8433 :
8434 10765 : if (CRHS) {
8435 4618 : if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
8436 16 : isBoolSGPR(LHS.getOperand(0))) {
8437 : // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
8438 : // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
8439 : // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
8440 : // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
8441 3 : if ((CRHS->isAllOnesValue() &&
8442 3 : (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
8443 0 : (CRHS->isNullValue() &&
8444 0 : (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
8445 : return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
8446 3 : DAG.getConstant(-1, SL, MVT::i1));
8447 0 : if ((CRHS->isAllOnesValue() &&
8448 0 : (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
8449 0 : (CRHS->isNullValue() &&
8450 0 : (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
8451 0 : return LHS.getOperand(0);
8452 : }
8453 :
8454 6005 : uint64_t CRHSVal = CRHS->getZExtValue();
8455 4389 : if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
8456 : LHS.getOpcode() == ISD::SELECT &&
8457 : isa<ConstantSDNode>(LHS.getOperand(1)) &&
8458 162 : isa<ConstantSDNode>(LHS.getOperand(2)) &&
8459 6005 : LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
8460 162 : isBoolSGPR(LHS.getOperand(0))) {
8461 : // Given CT != FT:
8462 : // setcc (select cc, CT, CF), CF, eq => xor cc, -1
8463 : // setcc (select cc, CT, CF), CF, ne => cc
8464 : // setcc (select cc, CT, CF), CT, ne => xor cc, -1
8465 : // setcc (select cc, CT, CF), CT, eq => cc
8466 : uint64_t CT = LHS.getConstantOperandVal(1);
8467 : uint64_t CF = LHS.getConstantOperandVal(2);
8468 :
8469 162 : if ((CF == CRHSVal && CC == ISD::SETEQ) ||
8470 5 : (CT == CRHSVal && CC == ISD::SETNE))
8471 : return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
8472 158 : DAG.getConstant(-1, SL, MVT::i1));
8473 4 : if ((CF == CRHSVal && CC == ISD::SETNE) ||
8474 3 : (CT == CRHSVal && CC == ISD::SETEQ))
8475 2 : return LHS.getOperand(0);
8476 : }
8477 : }
8478 :
8479 8818 : if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
8480 : VT != MVT::f16))
8481 3612 : return SDValue();
8482 :
8483 : // Match isinf/isfinite pattern
8484 : // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
8485 : // (fcmp one (fabs x), inf) -> (fp_class x,
8486 : // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
8487 6990 : if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
8488 : const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
8489 : if (!CRHS)
8490 0 : return SDValue();
8491 :
8492 10 : const APFloat &APF = CRHS->getValueAPF();
8493 10 : if (APF.isInfinity() && !APF.isNegative()) {
8494 : const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
8495 : SIInstrFlags::N_INFINITY;
8496 : const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
8497 : SIInstrFlags::P_ZERO |
8498 : SIInstrFlags::N_NORMAL |
8499 : SIInstrFlags::P_NORMAL |
8500 : SIInstrFlags::N_SUBNORMAL |
8501 : SIInstrFlags::P_SUBNORMAL;
8502 10 : unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
8503 : return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
8504 10 : DAG.getConstant(Mask, SL, MVT::i32));
8505 : }
8506 : }
8507 :
8508 6980 : return SDValue();
8509 : }
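
// Editorial sketch (standalone, not part of SIISelLowering.cpp): a model of the
// (setcc (sext from i1 cc), constant) fold above, assuming a 32-bit sign
// extension and showing only the eq/ne flavor; the sgt/ult/sle/uge forms are
// equivalent because the sign-extended value can only be 0 or -1. The asserts
// spell out why the combine may replace the setcc with `cc` or `xor cc, -1`.
#include <cassert>
#include <cstdint>

static bool setccOfSextBool(bool cc, int32_t rhs, bool isNE) {
  int32_t sext = cc ? -1 : 0;                  // sext from i1: true -> -1
  return isNE ? (sext != rhs) : (sext == rhs); // the original setcc
}

int main() {
  for (bool cc : {false, true}) {
    assert(setccOfSextBool(cc, -1, /*isNE=*/true) == !cc);  // -> xor cc, -1
    assert(setccOfSextBool(cc, -1, /*isNE=*/false) == cc);  // -> cc
    assert(setccOfSextBool(cc, 0, /*isNE=*/false) == !cc);  // -> xor cc, -1
    assert(setccOfSextBool(cc, 0, /*isNE=*/true) == cc);    // -> cc
  }
  return 0;
}
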
8510 :
8511 357 : SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
8512 : DAGCombinerInfo &DCI) const {
8513 357 : SelectionDAG &DAG = DCI.DAG;
8514 : SDLoc SL(N);
8515 357 : unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
8516 :
8517 357 : SDValue Src = N->getOperand(0);
8518 357 : SDValue Srl = N->getOperand(0);
8519 357 : if (Srl.getOpcode() == ISD::ZERO_EXTEND)
8520 48 : Srl = Srl.getOperand(0);
8521 :
8522 : // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
8523 357 : if (Srl.getOpcode() == ISD::SRL) {
8524 : // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
8525 : // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
8526 : // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
8527 :
8528 : if (const ConstantSDNode *C =
8529 : dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
8530 59 : Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
8531 59 : EVT(MVT::i32));
8532 :
8533 59 : unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
8534 59 : if (SrcOffset < 32 && SrcOffset % 8 == 0) {
8535 59 : return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
8536 59 : MVT::f32, Srl);
8537 : }
8538 : }
8539 : }
8540 :
8541 298 : APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
8542 :
8543 298 : KnownBits Known;
8544 298 : TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
8545 596 : !DCI.isBeforeLegalizeOps());
8546 298 : const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8547 596 : if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
8548 298 : TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
8549 93 : DCI.CommitTargetLoweringOpt(TLO);
8550 : }
8551 :
8552 298 : return SDValue();
8553 : }
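
// Editorial sketch (standalone, not backend code): the byte-index arithmetic
// used by the combine above when folding a constant right-shift into the
// CVT_F32_UBYTEn opcode. `ByteIdx` is the n in cvt_f32_ubyten and `ShiftAmt`
// is the srl amount; the fold is only legal when the selected byte is still
// byte-aligned and lies inside the 32-bit source.
#include <cassert>
#include <optional>

static std::optional<unsigned> foldSrlIntoUByteIndex(unsigned ByteIdx,
                                                     unsigned ShiftAmt) {
  unsigned SrcOffset = ShiftAmt + 8 * ByteIdx; // bit offset of the byte read
  if (SrcOffset < 32 && SrcOffset % 8 == 0)
    return SrcOffset / 8;                      // new cvt_f32_ubyte index
  return std::nullopt;
}

int main() {
  assert(foldSrlIntoUByteIndex(0, 16) == 2u); // cvt_f32_ubyte0 (srl x, 16) -> ubyte2
  assert(foldSrlIntoUByteIndex(1, 16) == 3u); // cvt_f32_ubyte1 (srl x, 16) -> ubyte3
  assert(foldSrlIntoUByteIndex(0, 8) == 1u);  // cvt_f32_ubyte0 (srl x, 8)  -> ubyte1
  assert(!foldSrlIntoUByteIndex(3, 16));      // would read past bit 31
  assert(!foldSrlIntoUByteIndex(0, 4));       // not byte aligned
  return 0;
}
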
8554 :
8555 334 : SDValue SITargetLowering::performClampCombine(SDNode *N,
8556 : DAGCombinerInfo &DCI) const {
8557 334 : ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
8558 : if (!CSrc)
8559 309 : return SDValue();
8560 :
8561 25 : const APFloat &F = CSrc->getValueAPF();
8562 25 : APFloat Zero = APFloat::getZero(F.getSemantics());
8563 25 : APFloat::cmpResult Cmp0 = F.compare(Zero);
8564 25 : if (Cmp0 == APFloat::cmpLessThan ||
8565 12 : (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
8566 18 : return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
8567 : }
8568 :
8569 16 : APFloat One(F.getSemantics(), "1.0");
8570 16 : APFloat::cmpResult Cmp1 = F.compare(One);
8571 16 : if (Cmp1 == APFloat::cmpGreaterThan)
8572 6 : return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
8573 :
8574 13 : return SDValue(CSrc, 0);
8575 : }
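
// Editorial sketch (standalone, not backend code) of the constant folding done
// by performClampCombine above: a constant below 0.0, or a NaN when DX10-style
// clamping is enabled, folds to 0.0; a constant above 1.0 folds to 1.0; any
// other constant (including NaN without DX10 clamp) is left unchanged.
#include <cassert>
#include <cmath>

static float foldClampConstant(float F, bool DX10Clamp) {
  if (F < 0.0f || (std::isnan(F) && DX10Clamp))
    return 0.0f;
  if (F > 1.0f)
    return 1.0f;
  return F; // already in [0, 1], or a NaN that must be preserved
}

int main() {
  assert(foldClampConstant(-2.5f, false) == 0.0f);
  assert(foldClampConstant(3.0f, false) == 1.0f);
  assert(foldClampConstant(0.25f, false) == 0.25f);
  assert(foldClampConstant(NAN, true) == 0.0f);
  assert(std::isnan(foldClampConstant(NAN, false)));
  return 0;
}
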
8576 :
8577 :
8578 1666237 : SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
8579 : DAGCombinerInfo &DCI) const {
8580 3332474 : switch (N->getOpcode()) {
8581 378179 : default:
8582 378179 : return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
8583 120080 : case ISD::ADD:
8584 120080 : return performAddCombine(N, DCI);
8585 3805 : case ISD::SUB:
8586 3805 : return performSubCombine(N, DCI);
8587 680 : case ISD::ADDCARRY:
8588 : case ISD::SUBCARRY:
8589 680 : return performAddCarrySubCarryCombine(N, DCI);
8590 8302 : case ISD::FADD:
8591 8302 : return performFAddCombine(N, DCI);
8592 1634 : case ISD::FSUB:
8593 1634 : return performFSubCombine(N, DCI);
8594 10765 : case ISD::SETCC:
8595 10765 : return performSetCCCombine(N, DCI);
8596 10329 : case ISD::FMAXNUM:
8597 : case ISD::FMINNUM:
8598 : case ISD::SMAX:
8599 : case ISD::SMIN:
8600 : case ISD::UMAX:
8601 : case ISD::UMIN:
8602 : case AMDGPUISD::FMIN_LEGACY:
8603 : case AMDGPUISD::FMAX_LEGACY: {
8604 10329 : if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
8605 3554 : getTargetMachine().getOptLevel() > CodeGenOpt::None)
8606 3554 : return performMinMaxCombine(N, DCI);
8607 : break;
8608 : }
8609 1833 : case ISD::FMA:
8610 1833 : return performFMACombine(N, DCI);
8611 : case ISD::LOAD: {
8612 310576 : if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
8613 48 : return Widened;
8614 : LLVM_FALLTHROUGH;
8615 : }
8616 : case ISD::STORE:
8617 : case ISD::ATOMIC_LOAD:
8618 : case ISD::ATOMIC_STORE:
8619 : case ISD::ATOMIC_CMP_SWAP:
8620 : case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
8621 : case ISD::ATOMIC_SWAP:
8622 : case ISD::ATOMIC_LOAD_ADD:
8623 : case ISD::ATOMIC_LOAD_SUB:
8624 : case ISD::ATOMIC_LOAD_AND:
8625 : case ISD::ATOMIC_LOAD_OR:
8626 : case ISD::ATOMIC_LOAD_XOR:
8627 : case ISD::ATOMIC_LOAD_NAND:
8628 : case ISD::ATOMIC_LOAD_MIN:
8629 : case ISD::ATOMIC_LOAD_MAX:
8630 : case ISD::ATOMIC_LOAD_UMIN:
8631 : case ISD::ATOMIC_LOAD_UMAX:
8632 : case AMDGPUISD::ATOMIC_INC:
8633 : case AMDGPUISD::ATOMIC_DEC:
8634 : case AMDGPUISD::ATOMIC_LOAD_FADD:
8635 : case AMDGPUISD::ATOMIC_LOAD_FMIN:
8636 : case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
8637 641131 : if (DCI.isBeforeLegalize())
8638 : break;
8639 341305 : return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
8640 36547 : case ISD::AND:
8641 36547 : return performAndCombine(N, DCI);
8642 15730 : case ISD::OR:
8643 15730 : return performOrCombine(N, DCI);
8644 1928 : case ISD::XOR:
8645 1928 : return performXorCombine(N, DCI);
8646 21073 : case ISD::ZERO_EXTEND:
8647 21073 : return performZeroExtendCombine(N, DCI);
8648 102 : case AMDGPUISD::FP_CLASS:
8649 102 : return performClassCombine(N, DCI);
8650 1086 : case ISD::FCANONICALIZE:
8651 1086 : return performFCanonicalizeCombine(N, DCI);
8652 763 : case AMDGPUISD::RCP:
8653 763 : return performRcpCombine(N, DCI);
8654 641 : case AMDGPUISD::FRACT:
8655 : case AMDGPUISD::RSQ:
8656 : case AMDGPUISD::RCP_LEGACY:
8657 : case AMDGPUISD::RSQ_LEGACY:
8658 : case AMDGPUISD::RCP_IFLAG:
8659 : case AMDGPUISD::RSQ_CLAMP:
8660 : case AMDGPUISD::LDEXP: {
8661 641 : SDValue Src = N->getOperand(0);
8662 641 : if (Src.isUndef())
8663 10 : return Src;
8664 : break;
8665 : }
8666 1699 : case ISD::SINT_TO_FP:
8667 : case ISD::UINT_TO_FP:
8668 1699 : return performUCharToFloatCombine(N, DCI);
8669 357 : case AMDGPUISD::CVT_F32_UBYTE0:
8670 : case AMDGPUISD::CVT_F32_UBYTE1:
8671 : case AMDGPUISD::CVT_F32_UBYTE2:
8672 : case AMDGPUISD::CVT_F32_UBYTE3:
8673 357 : return performCvtF32UByteNCombine(N, DCI);
8674 177 : case AMDGPUISD::FMED3:
8675 177 : return performFMed3Combine(N, DCI);
8676 113 : case AMDGPUISD::CVT_PKRTZ_F16_F32:
8677 113 : return performCvtPkRTZCombine(N, DCI);
8678 334 : case AMDGPUISD::CLAMP:
8679 334 : return performClampCombine(N, DCI);
8680 2463 : case ISD::SCALAR_TO_VECTOR: {
8681 2463 : SelectionDAG &DAG = DCI.DAG;
8682 4926 : EVT VT = N->getValueType(0);
8683 :
8684 : // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
8685 : if (VT == MVT::v2i16 || VT == MVT::v2f16) {
8686 : SDLoc SL(N);
8687 247 : SDValue Src = N->getOperand(0);
8688 : EVT EltVT = Src.getValueType();
8689 : if (EltVT == MVT::f16)
8690 164 : Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
8691 :
8692 247 : SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
8693 247 : return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
8694 : }
8695 :
8696 2216 : break;
8697 : }
8698 266830 : case ISD::EXTRACT_VECTOR_ELT:
8699 266830 : return performExtractVectorEltCombine(N, DCI);
8700 139608 : case ISD::BUILD_VECTOR:
8701 139608 : return performBuildVectorCombine(N, DCI);
8702 : }
8703 309448 : return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
8704 : }
8705 :
8706 : /// Helper function for adjustWritemask
8707 : static unsigned SubIdx2Lane(unsigned Idx) {
8708 : switch (Idx) {
8709 : default: return 0;
8710 : case AMDGPU::sub0: return 0;
8711 : case AMDGPU::sub1: return 1;
8712 : case AMDGPU::sub2: return 2;
8713 : case AMDGPU::sub3: return 3;
8714 : }
8715 : }
8716 :
8717 : /// Adjust the writemask of MIMG instructions
8718 738 : SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
8719 : SelectionDAG &DAG) const {
8720 738 : unsigned Opcode = Node->getMachineOpcode();
8721 :
8722 : // Subtract 1 because the vdata output is not a MachineSDNode operand.
8723 738 : int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
8724 1439 : if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
8725 : return Node; // not implemented for D16
8726 :
8727 714 : SDNode *Users[4] = { nullptr };
8728 : unsigned Lane = 0;
8729 714 : unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
8730 714 : unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
8731 : unsigned NewDmask = 0;
8732 714 : bool HasChain = Node->getNumValues() > 1;
8733 :
8734 714 : if (OldDmask == 0) {
8735 : // These are folded out, but on the chance it happens don't assert.
8736 : return Node;
8737 : }
8738 :
8739 : // Try to figure out the used register components
8740 714 : for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
8741 2693 : I != E; ++I) {
8742 :
8743 : // Don't look at users of the chain.
8744 2139 : if (I.getUse().getResNo() != 0)
8745 : continue;
8746 :
8747 : // Abort if we can't understand the usage
8748 2028 : if (!I->isMachineOpcode() ||
8749 : I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
8750 : return Node;
8751 :
8752 : // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
8753 : // Note that subregs are packed, i.e. Lane==0 is the first bit set
8754 : // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
8755 : // set, etc.
8756 1870 : Lane = SubIdx2Lane(I->getConstantOperandVal(1));
8757 :
8758 : // Set which texture component corresponds to the lane.
8759 : unsigned Comp;
8760 6292 : for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
8761 4422 : Comp = countTrailingZeros(Dmask);
8762 4422 : Dmask &= ~(1 << Comp);
8763 : }
8764 :
8765 : // Abort if we have more than one user per component
8766 1870 : if (Users[Lane])
8767 : return Node;
8768 :
8769 1868 : Users[Lane] = *I;
8770 1868 : NewDmask |= 1 << Comp;
8771 : }
8772 :
8773 : // Abort if there's no change
8774 554 : if (NewDmask == OldDmask)
8775 : return Node;
8776 :
8777 : unsigned BitsSet = countPopulation(NewDmask);
8778 :
8779 95 : int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
8780 : assert(NewOpcode != -1 &&
8781 : NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
8782 : "failed to find equivalent MIMG op");
8783 :
8784 : // Adjust the writemask in the node
8785 : SmallVector<SDValue, 12> Ops;
8786 95 : Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
8787 95 : Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
8788 285 : Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
8789 :
8790 190 : MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
8791 :
8792 : MVT ResultVT = BitsSet == 1 ?
8793 113 : SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
8794 : SDVTList NewVTList = HasChain ?
8795 95 : DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
8796 :
8797 :
8798 95 : MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
8799 : NewVTList, Ops);
8800 :
8801 95 : if (HasChain) {
8802 : // Update chain.
8803 184 : DAG.setNodeMemRefs(NewNode, Node->memoperands());
8804 184 : DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
8805 : }
8806 :
8807 95 : if (BitsSet == 1) {
8808 : assert(Node->hasNUsesOfValue(1, 0));
8809 57 : SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
8810 114 : SDLoc(Node), Users[Lane]->getValueType(0),
8811 : SDValue(NewNode, 0));
8812 57 : DAG.ReplaceAllUsesWith(Users[Lane], Copy);
8813 57 : return nullptr;
8814 : }
8815 :
8816 : // Update the users of the node with the new indices
8817 190 : for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
8818 152 : SDNode *User = Users[i];
8819 152 : if (!User)
8820 56 : continue;
8821 :
8822 96 : SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
8823 96 : DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
8824 :
8825 : switch (Idx) {
8826 : default: break;
8827 : case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
8828 : case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
8829 : case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
8830 : }
8831 : }
8832 :
8833 38 : DAG.RemoveDeadNode(Node);
8834 38 : return nullptr;
8835 : }
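
// Editorial sketch (standalone, not backend code) of the dmask recomputation
// in adjustWritemask above. Lane n corresponds to the (n+1)-th set bit of the
// old dmask, so the new dmask keeps exactly the components whose lanes are
// still extracted. Precondition: every lane index is < popcount(OldDmask).
#include <cassert>
#include <initializer_list>

static unsigned countTrailingZeros32(unsigned V) { // assumes V != 0
  unsigned N = 0;
  while ((V & 1u) == 0) { V >>= 1; ++N; }
  return N;
}

static unsigned recomputeDmask(unsigned OldDmask,
                               std::initializer_list<unsigned> UsedLanes) {
  unsigned NewDmask = 0;
  for (unsigned Lane : UsedLanes) {
    unsigned Dmask = OldDmask, Comp = 0;
    for (unsigned i = 0; i <= Lane; ++i) {   // walk to the Lane-th set bit
      Comp = countTrailingZeros32(Dmask);
      Dmask &= ~(1u << Comp);
    }
    NewDmask |= 1u << Comp;                  // keep that texture component
  }
  return NewDmask;
}

int main() {
  assert(recomputeDmask(0xF, {0, 2}) == 0x5); // xyzw loaded, only x/z used
  assert(recomputeDmask(0xA, {1}) == 0x8);    // y/w loaded, lane 1 is w
  return 0;
}
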
8836 :
8837 : static bool isFrameIndexOp(SDValue Op) {
8838 440176 : if (Op.getOpcode() == ISD::AssertZext)
8839 70 : Op = Op.getOperand(0);
8840 :
8841 : return isa<FrameIndexSDNode>(Op);
8842 : }
8843 :
8844 : /// Legalize target independent instructions (e.g. INSERT_SUBREG)
8845 : /// with frame index operands.
8846 : /// LLVM assumes that inputs to these instructions are registers.
8847 75731 : SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
8848 : SelectionDAG &DAG) const {
8849 75731 : if (Node->getOpcode() == ISD::CopyToReg) {
8850 17183 : RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
8851 17183 : SDValue SrcVal = Node->getOperand(2);
8852 :
8853 : // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
8854 : // to try understanding copies to physical registers.
8855 187 : if (SrcVal.getValueType() == MVT::i1 &&
8856 187 : TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
8857 : SDLoc SL(Node);
8858 8 : MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
8859 : SDValue VReg = DAG.getRegister(
8860 8 : MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
8861 :
8862 : SDNode *Glued = Node->getGluedNode();
8863 : SDValue ToVReg
8864 8 : = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
8865 10 : SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
8866 : SDValue ToResultReg
8867 : = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
8868 16 : VReg, ToVReg.getValue(1));
8869 8 : DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
8870 8 : DAG.RemoveDeadNode(Node);
8871 : return ToResultReg.getNode();
8872 : }
8873 : }
8874 :
8875 : SmallVector<SDValue, 8> Ops;
8876 591622 : for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
8877 880352 : if (!isFrameIndexOp(Node->getOperand(i))) {
8878 440151 : Ops.push_back(Node->getOperand(i));
8879 440151 : continue;
8880 : }
8881 :
8882 : SDLoc DL(Node);
8883 50 : Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
8884 : Node->getOperand(i).getValueType(),
8885 25 : Node->getOperand(i)), 0));
8886 : }
8887 :
8888 75723 : return DAG.UpdateNodeOperands(Node, Ops);
8889 : }
8890 :
8891 : /// Fold the instructions after selecting them.
8892 : /// Returns null if users were already updated.
8893 428230 : SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
8894 : SelectionDAG &DAG) const {
8895 428230 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8896 428230 : unsigned Opcode = Node->getMachineOpcode();
8897 :
8898 856460 : if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
8899 : !TII->isGather4(Opcode)) {
8900 738 : return adjustWritemask(Node, DAG);
8901 : }
8902 :
8903 854984 : if (Opcode == AMDGPU::INSERT_SUBREG ||
8904 427492 : Opcode == AMDGPU::REG_SEQUENCE) {
8905 58548 : legalizeTargetIndependentNode(Node, DAG);
8906 58548 : return Node;
8907 : }
8908 :
8909 368944 : switch (Opcode) {
8910 283 : case AMDGPU::V_DIV_SCALE_F32:
8911 : case AMDGPU::V_DIV_SCALE_F64: {
8912 : // Satisfy the operand register constraint when one of the inputs is
8913 : // undefined. Ordinarily each undef value will have its own implicit_def of
8914 : // a vreg, so force these to use a single register.
8915 283 : SDValue Src0 = Node->getOperand(0);
8916 283 : SDValue Src1 = Node->getOperand(1);
8917 283 : SDValue Src2 = Node->getOperand(2);
8918 :
8919 275 : if ((Src0.isMachineOpcode() &&
8920 283 : Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
8921 : (Src0 == Src1 || Src0 == Src2))
8922 : break;
8923 :
8924 : MVT VT = Src0.getValueType().getSimpleVT();
8925 11 : const TargetRegisterClass *RC = getRegClassFor(VT);
8926 :
8927 11 : MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
8928 11 : SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
8929 :
8930 22 : SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
8931 22 : UndefReg, Src0, SDValue());
8932 :
8933 : // src0 must be the same register as src1 or src2, even if the value is
8934 : // undefined, so make sure we don't violate this constraint.
8935 11 : if (Src0.isMachineOpcode() &&
8936 : Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
8937 3 : if (Src1.isMachineOpcode() &&
8938 : Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
8939 : Src0 = Src1;
8940 3 : else if (Src2.isMachineOpcode() &&
8941 : Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
8942 : Src0 = Src2;
8943 : else {
8944 : assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
8945 1 : Src0 = UndefReg;
8946 1 : Src1 = UndefReg;
8947 : }
8948 : } else
8949 : break;
8950 :
8951 3 : SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
8952 3 : for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
8953 0 : Ops.push_back(Node->getOperand(I));
8954 :
8955 3 : Ops.push_back(ImpDef.getValue(1));
8956 6 : return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
8957 : }
8958 : default:
8959 : break;
8960 : }
8961 :
8962 368941 : return Node;
8963 : }
8964 :
8965 : /// Assign the register class depending on the number of
8966 : /// bits set in the writemask
8967 41573 : void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8968 : SDNode *Node) const {
8969 41573 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8970 :
8971 41573 : MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8972 :
8973 124719 : if (TII->isVOP3(MI.getOpcode())) {
8974 : // Make sure constant bus requirements are respected.
8975 38943 : TII->legalizeOperandsVOP3(MRI, MI);
8976 38943 : return;
8977 : }
8978 :
8979 : // Replace unused atomics with the no return version.
8980 2630 : int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
8981 2630 : if (NoRetAtomicOp != -1) {
8982 2077 : if (!Node->hasAnyUseOfValue(0)) {
8983 950 : MI.setDesc(TII->get(NoRetAtomicOp));
8984 950 : MI.RemoveOperand(0);
8985 950 : return;
8986 : }
8987 :
8988 : // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
8989 : // instruction, because the return type of these instructions is a vec2 of
8990 : // the memory type, so it can be tied to the input operand.
8991 : // This means these instructions always have a use, so we need to add a
8992 : // special case to check if the atomic has only one extract_subreg use,
8993 : // which itself has no uses.
8994 1127 : if ((Node->hasNUsesOfValue(1, 0) &&
8995 2236 : Node->use_begin()->isMachineOpcode() &&
8996 1159 : Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
8997 32 : !Node->use_begin()->hasAnyUseOfValue(0))) {
8998 0 : unsigned Def = MI.getOperand(0).getReg();
8999 :
9000 : // Change this into a noret atomic.
9001 0 : MI.setDesc(TII->get(NoRetAtomicOp));
9002 0 : MI.RemoveOperand(0);
9003 :
9004 : // If we only remove the def operand from the atomic instruction, the
9005 : // extract_subreg will be left with a use of a vreg without a def.
9006 : // So we need to insert an implicit_def to avoid machine verifier
9007 : // errors.
9008 0 : BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
9009 0 : TII->get(AMDGPU::IMPLICIT_DEF), Def);
9010 : }
9011 1127 : return;
9012 : }
9013 : }
9014 :
9015 41816 : static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
9016 : uint64_t Val) {
9017 41816 : SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
9018 41816 : return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
9019 : }
9020 :
9021 3633 : MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
9022 : const SDLoc &DL,
9023 : SDValue Ptr) const {
9024 3633 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9025 :
9026 : // Build the half of the subregister with the constants before building the
9027 : // full 128-bit register. If we are building multiple resource descriptors,
9028 : // this will allow CSEing of the 2-component register.
9029 : const SDValue Ops0[] = {
9030 3633 : DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
9031 3633 : buildSMovImm32(DAG, DL, 0),
9032 3633 : DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9033 3633 : buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
9034 3633 : DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
9035 : };
9036 :
9037 3633 : SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
9038 : MVT::v2i32, Ops0), 0);
9039 :
9040 : // Combine the constants and the pointer.
9041 : const SDValue Ops1[] = {
9042 3633 : DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9043 : Ptr,
9044 3633 : DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
9045 : SubRegHi,
9046 3633 : DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
9047 3633 : };
9048 :
9049 3633 : return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
9050 : }
9051 :
9052 : /// Return a resource descriptor with the 'Add TID' bit enabled
9053 : /// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
9054 : /// of the resource descriptor) to create an offset, which is added to
9055 : /// the resource pointer.
9056 17275 : MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
9057 : SDValue Ptr, uint32_t RsrcDword1,
9058 : uint64_t RsrcDword2And3) const {
9059 17275 : SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
9060 17275 : SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
9061 17275 : if (RsrcDword1) {
9062 0 : PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
9063 : DAG.getConstant(RsrcDword1, DL, MVT::i32)),
9064 : 0);
9065 : }
9066 :
9067 : SDValue DataLo = buildSMovImm32(DAG, DL,
9068 17275 : RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
9069 17275 : SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
9070 :
9071 : const SDValue Ops[] = {
9072 17275 : DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9073 : PtrLo,
9074 17275 : DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9075 : PtrHi,
9076 17275 : DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
9077 : DataLo,
9078 17275 : DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
9079 : DataHi,
9080 17275 : DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
9081 17275 : };
9082 :
9083 17275 : return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
9084 : }
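
// Editorial sketch (standalone, not backend code) of how buildRSRC above packs
// the four 32-bit words of a buffer resource descriptor: sub0/sub1 come from
// the 64-bit pointer (sub1 optionally OR'ed with RsrcDword1), and sub2/sub3
// are the constant RsrcDword2And3 split into its low and high halves.
#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint32_t, 4> buildRsrcWords(uint64_t Ptr, uint32_t Dword1Bits,
                                              uint64_t Dword2And3) {
  return {static_cast<uint32_t>(Ptr & 0xFFFFFFFFu),        // sub0: pointer lo
          static_cast<uint32_t>(Ptr >> 32) | Dword1Bits,   // sub1: pointer hi
          static_cast<uint32_t>(Dword2And3 & 0xFFFFFFFFu), // sub2
          static_cast<uint32_t>(Dword2And3 >> 32)};        // sub3
}

int main() {
  auto W = buildRsrcWords(0x1122334455667788ull, 0, 0xAABBCCDD00112233ull);
  assert(W[0] == 0x55667788u && W[1] == 0x11223344u);
  assert(W[2] == 0x00112233u && W[3] == 0xAABBCCDDu);
  return 0;
}
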
9085 :
9086 : //===----------------------------------------------------------------------===//
9087 : // SI Inline Assembly Support
9088 : //===----------------------------------------------------------------------===//
9089 :
9090 : std::pair<unsigned, const TargetRegisterClass *>
9091 2343 : SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
9092 : StringRef Constraint,
9093 : MVT VT) const {
9094 : const TargetRegisterClass *RC = nullptr;
9095 2343 : if (Constraint.size() == 1) {
9096 1494 : switch (Constraint[0]) {
9097 0 : default:
9098 0 : return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9099 362 : case 's':
9100 : case 'r':
9101 362 : switch (VT.getSizeInBits()) {
9102 6 : default:
9103 6 : return std::make_pair(0U, nullptr);
9104 : case 32:
9105 : case 16:
9106 : RC = &AMDGPU::SReg_32_XM0RegClass;
9107 : break;
9108 77 : case 64:
9109 : RC = &AMDGPU::SGPR_64RegClass;
9110 77 : break;
9111 18 : case 128:
9112 : RC = &AMDGPU::SReg_128RegClass;
9113 18 : break;
9114 48 : case 256:
9115 : RC = &AMDGPU::SReg_256RegClass;
9116 48 : break;
9117 32 : case 512:
9118 : RC = &AMDGPU::SReg_512RegClass;
9119 32 : break;
9120 : }
9121 : break;
9122 385 : case 'v':
9123 385 : switch (VT.getSizeInBits()) {
9124 6 : default:
9125 6 : return std::make_pair(0U, nullptr);
9126 : case 32:
9127 : case 16:
9128 : RC = &AMDGPU::VGPR_32RegClass;
9129 : break;
9130 58 : case 64:
9131 : RC = &AMDGPU::VReg_64RegClass;
9132 58 : break;
9133 0 : case 96:
9134 : RC = &AMDGPU::VReg_96RegClass;
9135 0 : break;
9136 23 : case 128:
9137 : RC = &AMDGPU::VReg_128RegClass;
9138 23 : break;
9139 0 : case 256:
9140 : RC = &AMDGPU::VReg_256RegClass;
9141 0 : break;
9142 0 : case 512:
9143 : RC = &AMDGPU::VReg_512RegClass;
9144 0 : break;
9145 : }
9146 : break;
9147 : }
9148 : // We actually support i128, i16 and f16 as inline parameters
9149 : // even if they are not reported as legal
9150 38 : if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
9151 26 : VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
9152 : return std::make_pair(0U, RC);
9153 : }
9154 :
9155 1614 : if (Constraint.size() > 1) {
9156 3192 : if (Constraint[1] == 'v') {
9157 : RC = &AMDGPU::VGPR_32RegClass;
9158 724 : } else if (Constraint[1] == 's') {
9159 : RC = &AMDGPU::SGPR_32RegClass;
9160 : }
9161 :
9162 1596 : if (RC) {
9163 : uint32_t Idx;
9164 1430 : bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
9165 1430 : if (!Failed && Idx < RC->getNumRegs())
9166 : return std::make_pair(RC->getRegister(Idx), RC);
9167 : }
9168 : }
9169 1614 : return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9170 : }
9171 :
9172 : SITargetLowering::ConstraintType
9173 7630 : SITargetLowering::getConstraintType(StringRef Constraint) const {
9174 7630 : if (Constraint.size() == 1) {
9175 5702 : switch (Constraint[0]) {
9176 : default: break;
9177 : case 's':
9178 : case 'v':
9179 : return C_RegisterClass;
9180 : }
9181 : }
9182 4979 : return TargetLowering::getConstraintType(Constraint);
9183 : }
9184 :
9185 : // Figure out which registers should be reserved for stack access. Only after
9186 : // the function is legalized do we know all of the non-spill stack objects or if
9187 : // calls are present.
9188 19746 : void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
9189 19746 : MachineRegisterInfo &MRI = MF.getRegInfo();
9190 19746 : SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9191 19746 : const MachineFrameInfo &MFI = MF.getFrameInfo();
9192 19746 : const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
9193 :
9194 19746 : if (Info->isEntryFunction()) {
9195 : // Callable (non-entry) functions use fixed registers for stack access, so
9195 : // only entry functions need their private memory registers reserved here.
9196 17974 : reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
9197 : }
9198 :
9199 : // We have to assume the SP is needed in case there are calls in the function
9200 : // during lowering. Calls are only detected after the function is
9201 : // lowered. We're about to reserve registers, so don't bother reserving a
9202 : // stack pointer register if we aren't really going to need one.
9203 37720 : bool NeedSP = !Info->isEntryFunction() ||
9204 19746 : MFI.hasVarSizedObjects() ||
9205 17972 : MFI.hasCalls();
9206 :
9207 : if (NeedSP) {
9208 2193 : unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
9209 : Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
9210 :
9211 : assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
9212 : assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
9213 : Info->getStackPtrOffsetReg()));
9214 2193 : MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
9215 : }
9216 :
9217 19746 : MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
9218 19746 : MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
9219 19746 : MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
9220 : Info->getScratchWaveOffsetReg());
9221 :
9222 19746 : Info->limitOccupancy(MF);
9223 :
9224 19746 : TargetLoweringBase::finalizeLowering(MF);
9225 19746 : }
9226 :
9227 447083 : void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
9228 : KnownBits &Known,
9229 : const APInt &DemandedElts,
9230 : const SelectionDAG &DAG,
9231 : unsigned Depth) const {
9232 447083 : TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
9233 : DAG, Depth);
9234 :
9235 447083 : if (getSubtarget()->enableHugePrivateBuffer())
9236 : return;
9237 :
9238 : // Technically it may be possible to have a dispatch with a single workitem
9239 : // that uses the full private memory size, but that's not really useful. We
9240 : // can't use vaddr in MUBUF instructions if we don't know the address
9241 : // calculation won't overflow, so assume the sign bit is never set.
9242 447075 : Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
9243 : }
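
// Editorial sketch (standalone, not backend code) of what the code above
// reports: with a hypothetical AssumeFrameIndexHighZeroBits value of N, the
// top N bits of a frame-index value are claimed to be zero, which in
// particular pins the sign bit so MUBUF address math is never negative.
#include <cassert>
#include <cstdint>

static uint32_t frameIndexKnownZeroMask(unsigned HighZeroBits) {
  if (HighZeroBits == 0)
    return 0;
  return ~0u << (32 - HighZeroBits); // set the top HighZeroBits bits
}

int main() {
  assert(frameIndexKnownZeroMask(16) == 0xFFFF0000u);
  assert(frameIndexKnownZeroMask(1) == 0x80000000u); // at least the sign bit
  assert(frameIndexKnownZeroMask(0) == 0u);
  return 0;
}
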
9244 :
9245 2199040 : bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
9246 : FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
9247 : {
9248 4398080 : switch (N->getOpcode()) {
9249 144606 : case ISD::Register:
9250 : case ISD::CopyFromReg:
9251 : {
9252 : const RegisterSDNode *R = nullptr;
9253 144606 : if (N->getOpcode() == ISD::Register) {
9254 : R = dyn_cast<RegisterSDNode>(N);
9255 : }
9256 : else {
9257 62448 : R = dyn_cast<RegisterSDNode>(N->getOperand(1));
9258 : }
9259 144606 : if (R)
9260 : {
9261 144606 : const MachineFunction * MF = FLI->MF;
9262 144606 : const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
9263 144606 : const MachineRegisterInfo &MRI = MF->getRegInfo();
9264 144606 : const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
9265 144606 : unsigned Reg = R->getReg();
9266 144606 : if (TRI.isPhysicalRegister(Reg))
9267 16970 : return TRI.isVGPR(MRI, Reg);
9268 :
9269 127636 : if (MRI.isLiveIn(Reg)) {
9270 : // workitem.id.x workitem.id.y workitem.id.z
9271 : // Any VGPR formal argument is also considered divergent
9272 107680 : if (TRI.isVGPR(MRI, Reg))
9273 : return true;
9274 : // Formal arguments of non-entry functions
9275 : // are conservatively considered divergent
9276 160228 : else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
9277 : return true;
9278 : }
9279 96322 : return !KDA || KDA->isDivergent(FLI->getValueFromVirtualReg(Reg));
9280 0 : }
9281 : }
9282 : break;
9283 : case ISD::LOAD: {
9284 : const LoadSDNode *L = cast<LoadSDNode>(N);
9285 : unsigned AS = L->getAddressSpace();
9286 : // A flat load may access private memory.
9287 137717 : return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
9288 : }
9289 : case ISD::CALLSEQ_END:
9290 : return true;
9292 22034 : case ISD::INTRINSIC_WO_CHAIN:
9296 22034 : return AMDGPU::isIntrinsicSourceOfDivergence(
9297 66102 : cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
9298 2207 : case ISD::INTRINSIC_W_CHAIN:
9299 2207 : return AMDGPU::isIntrinsicSourceOfDivergence(
9300 6621 : cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
9301 : // In some cases intrinsics that are a source of divergence have been
9302 : // lowered to AMDGPUISD so we also need to check those too.
9303 : case AMDGPUISD::INTERP_MOV:
9304 : case AMDGPUISD::INTERP_P1:
9305 : case AMDGPUISD::INTERP_P2:
9306 : return true;
9307 : }
9308 : return false;
9309 : }
9310 :
9311 720 : bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
9312 720 : switch (VT.getScalarType().getSimpleVT().SimpleTy) {
9313 334 : case MVT::f32:
9314 334 : return Subtarget->hasFP32Denormals();
9315 69 : case MVT::f64:
9316 69 : return Subtarget->hasFP64Denormals();
9317 317 : case MVT::f16:
9318 317 : return Subtarget->hasFP16Denormals();
9319 : default:
9320 : return false;
9321 : }
9322 : }
|