Bug Summary

File: lib/Target/AMDGPU/SIISelLowering.cpp
Warning: line 7179, column 20
The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'int'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name SIISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-eagerly-assume -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -mrelocation-model pic -pic-level 2 -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-7/lib/clang/7.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-7~svn329677/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-7~svn329677/build-llvm/include -I /build/llvm-toolchain-snapshot-7~svn329677/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/c++/7.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/x86_64-linux-gnu/c++/7.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/x86_64-linux-gnu/c++/7.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/c++/7.3.0/backward -internal-isystem /usr/include/clang/7.0.0/include/ -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-7/lib/clang/7.0.0/include -internal-externc-isystem 
/usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-7~svn329677/build-llvm/lib/Target/AMDGPU -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-checker optin.performance.Padding -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2018-04-11-031539-24776-1 -x c++ /build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp
<
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Custom DAG lowering for SI
12//
13//===----------------------------------------------------------------------===//
14
15#ifdef _MSC_VER
16// Provide M_PI.
17#define _USE_MATH_DEFINES
18#endif
19
20#include "SIISelLowering.h"
21#include "AMDGPU.h"
22#include "AMDGPUIntrinsicInfo.h"
23#include "AMDGPUSubtarget.h"
24#include "AMDGPUTargetMachine.h"
25#include "SIDefines.h"
26#include "SIInstrInfo.h"
27#include "SIMachineFunctionInfo.h"
28#include "SIRegisterInfo.h"
29#include "Utils/AMDGPUBaseInfo.h"
30#include "llvm/ADT/APFloat.h"
31#include "llvm/ADT/APInt.h"
32#include "llvm/ADT/ArrayRef.h"
33#include "llvm/ADT/BitVector.h"
34#include "llvm/ADT/SmallVector.h"
35#include "llvm/ADT/Statistic.h"
36#include "llvm/ADT/StringRef.h"
37#include "llvm/ADT/StringSwitch.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/CodeGen/Analysis.h"
40#include "llvm/CodeGen/CallingConvLower.h"
41#include "llvm/CodeGen/DAGCombine.h"
42#include "llvm/CodeGen/ISDOpcodes.h"
43#include "llvm/CodeGen/MachineBasicBlock.h"
44#include "llvm/CodeGen/MachineFrameInfo.h"
45#include "llvm/CodeGen/MachineFunction.h"
46#include "llvm/CodeGen/MachineInstr.h"
47#include "llvm/CodeGen/MachineInstrBuilder.h"
48#include "llvm/CodeGen/MachineMemOperand.h"
49#include "llvm/CodeGen/MachineModuleInfo.h"
50#include "llvm/CodeGen/MachineOperand.h"
51#include "llvm/CodeGen/MachineRegisterInfo.h"
52#include "llvm/CodeGen/SelectionDAG.h"
53#include "llvm/CodeGen/SelectionDAGNodes.h"
54#include "llvm/CodeGen/TargetCallingConv.h"
55#include "llvm/CodeGen/TargetRegisterInfo.h"
56#include "llvm/CodeGen/ValueTypes.h"
57#include "llvm/IR/Constants.h"
58#include "llvm/IR/DataLayout.h"
59#include "llvm/IR/DebugLoc.h"
60#include "llvm/IR/DerivedTypes.h"
61#include "llvm/IR/DiagnosticInfo.h"
62#include "llvm/IR/Function.h"
63#include "llvm/IR/GlobalValue.h"
64#include "llvm/IR/InstrTypes.h"
65#include "llvm/IR/Instruction.h"
66#include "llvm/IR/Instructions.h"
67#include "llvm/IR/IntrinsicInst.h"
68#include "llvm/IR/Type.h"
69#include "llvm/Support/Casting.h"
70#include "llvm/Support/CodeGen.h"
71#include "llvm/Support/CommandLine.h"
72#include "llvm/Support/Compiler.h"
73#include "llvm/Support/ErrorHandling.h"
74#include "llvm/Support/KnownBits.h"
75#include "llvm/Support/MachineValueType.h"
76#include "llvm/Support/MathExtras.h"
77#include "llvm/Target/TargetOptions.h"
78#include <cassert>
79#include <cmath>
80#include <cstdint>
81#include <iterator>
82#include <tuple>
83#include <utility>
84#include <vector>
85
86using namespace llvm;
87
// Name passed as the debug type for this file; the STATISTIC below is
// registered under the same "si-lower" string.
// NOTE(review): in this listing the text that follows a macro use on the same
// line (and its continuation lines) appears to be the analyzer's rendering of
// the macro expansion rather than original source text.
88#define DEBUG_TYPE"si-lower" "si-lower"
89
// Statistic counter described as "Number of tail calls".
90STATISTIC(NumTailCalls, "Number of tail calls")static llvm::Statistic NumTailCalls = {"si-lower", "NumTailCalls"
, "Number of tail calls", {0}, {false}}
;
91
// Command-line option "amdgpu-vgpr-index-mode" (bool, initial value false):
// use GPR indexing mode instead of movrel for vector indexing.
92static cl::opt<bool> EnableVGPRIndexMode(
93 "amdgpu-vgpr-index-mode",
94 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
95 cl::init(false));
96
// Command-line option "amdgpu-ds128" (bool, initial value false):
// use DS_read/write_b128.
97static cl::opt<bool> EnableDS128(
98 "amdgpu-ds128",
99 cl::desc("Use DS_read/write_b128"),
100 cl::init(false));
101
// Hidden command-line option "amdgpu-frame-index-zero-bits" (unsigned, initial
// value 5): number of high bits of a frame index assumed to be zero.
102static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
103 "amdgpu-frame-index-zero-bits",
104 cl::desc("High bits of frame index assumed to be zero"),
105 cl::init(5),
106 cl::ReallyHidden);
107
108static unsigned findFirstFreeSGPR(CCState &CCInfo) {
109 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
110 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
111 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
112 return AMDGPU::SGPR0 + Reg;
113 }
114 }
115 llvm_unreachable("Cannot allocate sgpr")::llvm::llvm_unreachable_internal("Cannot allocate sgpr", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 115)
;
116}
117
/// Construct the SI target lowering. The body registers a register class for
/// each value type it supports (some only when the subtarget reports 16-bit or
/// VOP3P instructions), computes register properties, and then records the
/// action (Legal / Custom / Promote / Expand) for many (operation, type) pairs,
/// the truncating stores to expand, the node kinds that get target DAG
/// combines, and finally the scheduling preference.
/// NOTE(review): several (operation, type) pairs are set more than once below
/// (e.g. by the blanket loops and then individually); presumably the last call
/// wins, so statement order should be preserved -- confirm before reordering.
118SITargetLowering::SITargetLowering(const TargetMachine &TM,
119 const SISubtarget &STI)
120 : AMDGPUTargetLowering(TM, STI) {
121 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
122 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
123
124 addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
125 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
126
127 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
128 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
129 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
130
131 addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
132 addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
133
134 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
135 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
136
137 addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
138 addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
139
140 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
141 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
142
143 if (Subtarget->has16BitInsts()) {
144 addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
145 addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
146 }
147
148 if (Subtarget->hasVOP3PInsts()) {
149 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
150 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
151 }
152
153 computeRegisterProperties(STI.getRegisterInfo());
154
155 // Custom lower loads of these vector types and of i1; the matching store
// actions follow in the next group.
// NOTE(review): the original comment here read "vector stores from local
// memory" although it sits above the LOAD actions; the local-memory rationale
// is not visible in this function -- confirm.
156 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
157 setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
158 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
159 setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
160 setOperationAction(ISD::LOAD, MVT::i1, Custom);
161
162 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
163 setOperationAction(ISD::STORE, MVT::v4i32, Custom);
164 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
165 setOperationAction(ISD::STORE, MVT::v16i32, Custom);
166 setOperationAction(ISD::STORE, MVT::i1, Custom);
167
168 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
169 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
170 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
171 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
172 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
173 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
174 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
175 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
176 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
177 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
178
179 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
180 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
181 setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
182
183 setOperationAction(ISD::SELECT, MVT::i1, Promote);
184 setOperationAction(ISD::SELECT, MVT::i64, Custom);
185 setOperationAction(ISD::SELECT, MVT::f64, Promote);
186 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
187
188 setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
189 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
190 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
191 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
192 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
193
194 setOperationAction(ISD::SETCC, MVT::i1, Promote);
195 setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
196 setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
197 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
198
199 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
200 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
201
202 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
203 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
204 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
205 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
206 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
207 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
208 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
209
210 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
211 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
212 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
213 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
214 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
215
216 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
217 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
218 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
219
220 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
221 setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
222 setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
223 setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
224
225 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
226 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
227 setOperationAction(ISD::BR_CC, MVT::i32, Expand);
228 setOperationAction(ISD::BR_CC, MVT::i64, Expand);
229 setOperationAction(ISD::BR_CC, MVT::f32, Expand);
230 setOperationAction(ISD::BR_CC, MVT::f64, Expand);
231
232 setOperationAction(ISD::UADDO, MVT::i32, Legal);
233 setOperationAction(ISD::USUBO, MVT::i32, Legal);
234
235 setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
236 setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
237
// The i64 carry actions below are compiled out by the #if 0 guard.
238#if 0
239 setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
240 setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
241#endif
242
243 //setOperationAction(ISD::ADDC, MVT::i64, Expand);
244 //setOperationAction(ISD::SUBC, MVT::i64, Expand);
245
246 // We only support LOAD/STORE and vector manipulation ops for vectors
247 // with > 4 elements.
// For each listed type, every opcode below ISD::BUILTIN_OP_END is visited:
// the listed memory/vector-manipulation opcodes are left untouched,
// CONCAT_VECTORS is set to Custom, and everything else is set to Expand.
248 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
249 MVT::v2i64, MVT::v2f64}) {
250 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
251 switch (Op) {
252 case ISD::LOAD:
253 case ISD::STORE:
254 case ISD::BUILD_VECTOR:
255 case ISD::BITCAST:
256 case ISD::EXTRACT_VECTOR_ELT:
257 case ISD::INSERT_VECTOR_ELT:
258 case ISD::INSERT_SUBVECTOR:
259 case ISD::EXTRACT_SUBVECTOR:
260 case ISD::SCALAR_TO_VECTOR:
261 break;
262 case ISD::CONCAT_VECTORS:
263 setOperationAction(Op, VT, Custom);
264 break;
265 default:
266 setOperationAction(Op, VT, Expand);
267 break;
268 }
269 }
270 }
271
272 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
273 // is expanded to avoid having two separate loops in case the index is a VGPR.
274
275 // Most operations are naturally 32-bit vector operations. We only support
276 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
277 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
278 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
279 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
280
281 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
282 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
283
284 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
285 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
286
287 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
288 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
289 }
290
291 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
292 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
293 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
294 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
295
296 // Avoid stack access for these.
297 // TODO: Generalize to more vector types.
298 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
299 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
300 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
301 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
302
303 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
304 // and output demarshalling
305 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
306 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
307
308 // We can't return success/failure, only the old value,
309 // let LLVM add the comparison
310 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
311 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
312
313 if (getSubtarget()->hasFlatAddressSpace()) {
314 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
315 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
316 }
317
318 setOperationAction(ISD::BSWAP, MVT::i32, Legal);
319 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
320
321 // On SI this is s_memtime and s_memrealtime on VI.
322 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
323 setOperationAction(ISD::TRAP, MVT::Other, Custom);
324 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
325
326 setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
327 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
328
329 if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
330 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
331 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
332 setOperationAction(ISD::FRINT, MVT::f64, Legal);
333 }
334
335 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
336
337 setOperationAction(ISD::FSIN, MVT::f32, Custom);
338 setOperationAction(ISD::FCOS, MVT::f32, Custom);
339 setOperationAction(ISD::FDIV, MVT::f32, Custom);
340 setOperationAction(ISD::FDIV, MVT::f64, Custom);
341
// Actions for the scalar i16 / f16 types, applied only when the subtarget
// reports 16-bit instructions.
342 if (Subtarget->has16BitInsts()) {
343 setOperationAction(ISD::Constant, MVT::i16, Legal);
344
345 setOperationAction(ISD::SMIN, MVT::i16, Legal);
346 setOperationAction(ISD::SMAX, MVT::i16, Legal);
347
348 setOperationAction(ISD::UMIN, MVT::i16, Legal);
349 setOperationAction(ISD::UMAX, MVT::i16, Legal);
350
351 setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
352 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
353
354 setOperationAction(ISD::ROTR, MVT::i16, Promote);
355 setOperationAction(ISD::ROTL, MVT::i16, Promote);
356
357 setOperationAction(ISD::SDIV, MVT::i16, Promote);
358 setOperationAction(ISD::UDIV, MVT::i16, Promote);
359 setOperationAction(ISD::SREM, MVT::i16, Promote);
360 setOperationAction(ISD::UREM, MVT::i16, Promote);
361
362 setOperationAction(ISD::BSWAP, MVT::i16, Promote);
363 setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
364
365 setOperationAction(ISD::CTTZ, MVT::i16, Promote);
366 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
367 setOperationAction(ISD::CTLZ, MVT::i16, Promote);
368 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
369 setOperationAction(ISD::CTPOP, MVT::i16, Promote);
370
371 setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
372
373 setOperationAction(ISD::BR_CC, MVT::i16, Expand);
374
375 setOperationAction(ISD::LOAD, MVT::i16, Custom);
376
377 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
378
379 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
380 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
381 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
382 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
383
384 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
385 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
386 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
387 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
388
389 // F16 - Constant Actions.
390 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
391
392 // F16 - Load/Store Actions.
393 setOperationAction(ISD::LOAD, MVT::f16, Promote);
394 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
395 setOperationAction(ISD::STORE, MVT::f16, Promote);
396 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
397
398 // F16 - VOP1 Actions.
399 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
400 setOperationAction(ISD::FCOS, MVT::f16, Promote);
401 setOperationAction(ISD::FSIN, MVT::f16, Promote);
402 setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
403 setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
404 setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
405 setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
406 setOperationAction(ISD::FROUND, MVT::f16, Custom);
407
408 // F16 - VOP2 Actions.
409 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
410 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
411 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
412 setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
413 setOperationAction(ISD::FDIV, MVT::f16, Custom);
414
415 // F16 - VOP3 Actions.
416 setOperationAction(ISD::FMA, MVT::f16, Legal);
417 if (!Subtarget->hasFP16Denormals())
418 setOperationAction(ISD::FMAD, MVT::f16, Legal);
419 }
420
// Actions for the packed v2i16 / v2f16 types, applied only when the subtarget
// reports VOP3P instructions; otherwise only SELECT on those types is set
// (to Custom) in the else branch.
421 if (Subtarget->hasVOP3PInsts()) {
422 for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
423 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
424 switch (Op) {
425 case ISD::LOAD:
426 case ISD::STORE:
427 case ISD::BUILD_VECTOR:
428 case ISD::BITCAST:
429 case ISD::EXTRACT_VECTOR_ELT:
430 case ISD::INSERT_VECTOR_ELT:
431 case ISD::INSERT_SUBVECTOR:
432 case ISD::EXTRACT_SUBVECTOR:
433 case ISD::SCALAR_TO_VECTOR:
434 break;
435 case ISD::CONCAT_VECTORS:
436 setOperationAction(Op, VT, Custom);
437 break;
438 default:
439 setOperationAction(Op, VT, Expand);
440 break;
441 }
442 }
443 }
444
445 // XXX - Do these do anything? Vector constants turn into build_vector.
446 setOperationAction(ISD::Constant, MVT::v2i16, Legal);
447 setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
448
449 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
450 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
451 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
452 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
453
454 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
455 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
456 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
457 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
458
459 setOperationAction(ISD::AND, MVT::v2i16, Promote);
460 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
461 setOperationAction(ISD::OR, MVT::v2i16, Promote);
462 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
463 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
464 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
465 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
466 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
467 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
468 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
469
470 setOperationAction(ISD::ADD, MVT::v2i16, Legal);
471 setOperationAction(ISD::SUB, MVT::v2i16, Legal);
472 setOperationAction(ISD::MUL, MVT::v2i16, Legal);
473 setOperationAction(ISD::SHL, MVT::v2i16, Legal);
474 setOperationAction(ISD::SRL, MVT::v2i16, Legal);
475 setOperationAction(ISD::SRA, MVT::v2i16, Legal);
476 setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
477 setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
478 setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
479 setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
480
481 setOperationAction(ISD::FADD, MVT::v2f16, Legal);
482 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
483 setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
484 setOperationAction(ISD::FMA, MVT::v2f16, Legal);
485 setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
486 setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
487
488 // This isn't really legal, but this avoids the legalizer unrolling it (and
489 // allows matching fneg (fabs x) patterns)
490 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
491
// NOTE(review): these two EXTRACT_VECTOR_ELT actions repeat the unconditional
// settings made earlier in this constructor (under "Avoid stack access for
// these"), and the loop above leaves EXTRACT_VECTOR_ELT untouched, so they
// look redundant.
492 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
493 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
494
495 setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
496 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
497 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
498 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
499 } else {
500 setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
501 setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
502 }
503
504 for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
505 setOperationAction(ISD::SELECT, VT, Custom);
506 }
507
// Node kinds for which target DAG combines are requested.
508 setTargetDAGCombine(ISD::ADD);
509 setTargetDAGCombine(ISD::ADDCARRY);
510 setTargetDAGCombine(ISD::SUB);
511 setTargetDAGCombine(ISD::SUBCARRY);
512 setTargetDAGCombine(ISD::FADD);
513 setTargetDAGCombine(ISD::FSUB);
514 setTargetDAGCombine(ISD::FMINNUM);
515 setTargetDAGCombine(ISD::FMAXNUM);
516 setTargetDAGCombine(ISD::SMIN);
517 setTargetDAGCombine(ISD::SMAX);
518 setTargetDAGCombine(ISD::UMIN);
519 setTargetDAGCombine(ISD::UMAX);
520 setTargetDAGCombine(ISD::SETCC);
521 setTargetDAGCombine(ISD::AND);
522 setTargetDAGCombine(ISD::OR);
523 setTargetDAGCombine(ISD::XOR);
524 setTargetDAGCombine(ISD::SINT_TO_FP);
525 setTargetDAGCombine(ISD::UINT_TO_FP);
526 setTargetDAGCombine(ISD::FCANONICALIZE);
527 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
528 setTargetDAGCombine(ISD::ZERO_EXTEND);
529 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
530 setTargetDAGCombine(ISD::BUILD_VECTOR);
531
532 // All memory operations. Some folding on the pointer operand is done to help
533 // matching the constant offsets in the addressing modes.
534 setTargetDAGCombine(ISD::LOAD);
535 setTargetDAGCombine(ISD::STORE);
536 setTargetDAGCombine(ISD::ATOMIC_LOAD);
537 setTargetDAGCombine(ISD::ATOMIC_STORE);
538 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
539 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
540 setTargetDAGCombine(ISD::ATOMIC_SWAP);
541 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
542 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
543 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
544 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
545 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
546 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
547 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
548 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
549 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
550 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
551
552 setSchedulingPreference(Sched::RegPressure);
553}
554
555const SISubtarget *SITargetLowering::getSubtarget() const {
556 return static_cast<const SISubtarget *>(Subtarget);
557}
558
559//===----------------------------------------------------------------------===//
560// TargetLowering queries
561//===----------------------------------------------------------------------===//
562
563bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
564 // SI has some legal vector types, but no legal vector operations. Say no
565 // shuffles are legal in order to prefer scalarizing some vector operations.
566 return false;
567}
568
569bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
570 const CallInst &CI,
571 MachineFunction &MF,
572 unsigned IntrID) const {
573 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
574 AMDGPU::lookupRsrcIntrinsicByIntr(IntrID)) {
575 AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
576 (Intrinsic::ID)IntrID);
577 if (Attr.hasFnAttribute(Attribute::ReadNone))
578 return false;
579
580 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
581
582 if (RsrcIntr->IsImage) {
583 Info.ptrVal = MFI->getImagePSV(
584 *MF.getSubtarget<SISubtarget>().getInstrInfo(),
585 CI.getArgOperand(RsrcIntr->RsrcArg));
586 Info.align = 0;
587 } else {
588 Info.ptrVal = MFI->getBufferPSV(
589 *MF.getSubtarget<SISubtarget>().getInstrInfo(),
590 CI.getArgOperand(RsrcIntr->RsrcArg));
591 }
592
593 Info.flags = MachineMemOperand::MODereferenceable;
594 if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
595 Info.opc = ISD::INTRINSIC_W_CHAIN;
596 Info.memVT = MVT::getVT(CI.getType());
597 Info.flags |= MachineMemOperand::MOLoad;
598 } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
599 Info.opc = ISD::INTRINSIC_VOID;
600 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
601 Info.flags |= MachineMemOperand::MOStore;
602 } else {
603 // Atomic
604 Info.opc = ISD::INTRINSIC_W_CHAIN;
605 Info.memVT = MVT::getVT(CI.getType());
606 Info.flags = MachineMemOperand::MOLoad |
607 MachineMemOperand::MOStore |
608 MachineMemOperand::MODereferenceable;
609
610 // XXX - Should this be volatile without known ordering?
611 Info.flags |= MachineMemOperand::MOVolatile;
612 }
613 return true;
614 }
615
616 switch (IntrID) {
617 case Intrinsic::amdgcn_atomic_inc:
618 case Intrinsic::amdgcn_atomic_dec:
619 case Intrinsic::amdgcn_ds_fadd:
620 case Intrinsic::amdgcn_ds_fmin:
621 case Intrinsic::amdgcn_ds_fmax: {
622 Info.opc = ISD::INTRINSIC_W_CHAIN;
623 Info.memVT = MVT::getVT(CI.getType());
624 Info.ptrVal = CI.getOperand(0);
625 Info.align = 0;
626 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
627
628 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
629 if (!Vol || !Vol->isZero())
630 Info.flags |= MachineMemOperand::MOVolatile;
631
632 return true;
633 }
634
635 default:
636 return false;
637 }
638}
639
640bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
641 SmallVectorImpl<Value*> &Ops,
642 Type *&AccessTy) const {
643 switch (II->getIntrinsicID()) {
644 case Intrinsic::amdgcn_atomic_inc:
645 case Intrinsic::amdgcn_atomic_dec:
646 case Intrinsic::amdgcn_ds_fadd:
647 case Intrinsic::amdgcn_ds_fmin:
648 case Intrinsic::amdgcn_ds_fmax: {
649 Value *Ptr = II->getArgOperand(0);
650 AccessTy = II->getType();
651 Ops.push_back(Ptr);
652 return true;
653 }
654 default:
655 return false;
656 }
657}
658
659bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
660 if (!Subtarget->hasFlatInstOffsets()) {
661 // Flat instructions do not have offsets, and only have the register
662 // address.
663 return AM.BaseOffs == 0 && AM.Scale == 0;
664 }
665
666 // GFX9 added a 13-bit signed offset. When using regular flat instructions,
667 // the sign bit is ignored and is treated as a 12-bit unsigned offset.
668
669 // Just r + i
670 return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
671}
672
673bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
674 if (Subtarget->hasFlatGlobalInsts())
675 return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
676
677 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
678 // Assume the we will use FLAT for all global memory accesses
679 // on VI.
680 // FIXME: This assumption is currently wrong. On VI we still use
681 // MUBUF instructions for the r + i addressing mode. As currently
682 // implemented, the MUBUF instructions only work on buffer < 4GB.
683 // It may be possible to support > 4GB buffers with MUBUF instructions,
684 // by setting the stride value in the resource descriptor which would
685 // increase the size limit to (stride * 4GB). However, this is risky,
686 // because it has never been validated.
687 return isLegalFlatAddressingMode(AM);
688 }
689
690 return isLegalMUBUFAddressingMode(AM);
691}
692
693bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
694 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
695 // additionally can do r + r + i with addr64. 32-bit has more addressing
696 // mode options. Depending on the resource constant, it can also do
697 // (i64 r0) + (i32 r1) * (i14 i).
698 //
699 // Private arrays end up using a scratch buffer most of the time, so also
700 // assume those use MUBUF instructions. Scratch loads / stores are currently
701 // implemented as mubuf instructions with offen bit set, so slightly
702 // different than the normal addr64.
703 if (!isUInt<12>(AM.BaseOffs))
704 return false;
705
706 // FIXME: Since we can split immediate into soffset and immediate offset,
707 // would it make sense to allow any immediate?
708
709 switch (AM.Scale) {
710 case 0: // r + i or just i, depending on HasBaseReg.
711 return true;
712 case 1:
713 return true; // We have r + r or r + i.
714 case 2:
715 if (AM.HasBaseReg) {
716 // Reject 2 * r + r.
717 return false;
718 }
719
720 // Allow 2 * r as r + r
721 // Or 2 * r + i is allowed as r + r + i.
722 return true;
723 default: // Don't allow n * r
724 return false;
725 }
726}
727
728bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
729 const AddrMode &AM, Type *Ty,
730 unsigned AS, Instruction *I) const {
731 // No global is ever allowed as a base.
732 if (AM.BaseGV)
733 return false;
734
735 if (AS == AMDGPUASI.GLOBAL_ADDRESS)
736 return isLegalGlobalAddressingMode(AM);
737
738 if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
739 AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
740 // If the offset isn't a multiple of 4, it probably isn't going to be
741 // correctly aligned.
742 // FIXME: Can we get the real alignment here?
743 if (AM.BaseOffs % 4 != 0)
744 return isLegalMUBUFAddressingMode(AM);
745
746 // There are no SMRD extloads, so if we have to do a small type access we
747 // will use a MUBUF load.
748 // FIXME?: We also need to do this if unaligned, but we don't know the
749 // alignment here.
750 if (DL.getTypeStoreSize(Ty) < 4)
751 return isLegalGlobalAddressingMode(AM);
752
753 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
754 // SMRD instructions have an 8-bit, dword offset on SI.
755 if (!isUInt<8>(AM.BaseOffs / 4))
756 return false;
757 } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
758 // On CI+, this can also be a 32-bit literal constant offset. If it fits
759 // in 8-bits, it can use a smaller encoding.
760 if (!isUInt<32>(AM.BaseOffs / 4))
761 return false;
762 } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
763 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
764 if (!isUInt<20>(AM.BaseOffs))
765 return false;
766 } else
767 llvm_unreachable("unhandled generation")::llvm::llvm_unreachable_internal("unhandled generation", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 767)
;
768
769 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
770 return true;
771
772 if (AM.Scale == 1 && AM.HasBaseReg)
773 return true;
774
775 return false;
776
777 } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
778 return isLegalMUBUFAddressingMode(AM);
779 } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
780 AS == AMDGPUASI.REGION_ADDRESS) {
781 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
782 // field.
783 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
784 // an 8-bit dword offset but we don't know the alignment here.
785 if (!isUInt<16>(AM.BaseOffs))
786 return false;
787
788 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
789 return true;
790
791 if (AM.Scale == 1 && AM.HasBaseReg)
792 return true;
793
794 return false;
795 } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
796 AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
797 // For an unknown address space, this usually means that this is for some
798 // reason being used for pure arithmetic, and not based on some addressing
799 // computation. We don't have instructions that compute pointers with any
800 // addressing modes, so treat them as having no offset like flat
801 // instructions.
802 return isLegalFlatAddressingMode(AM);
803 } else {
804 llvm_unreachable("unhandled address space")::llvm::llvm_unreachable_internal("unhandled address space", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 804)
;
805 }
806}
807
808bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
809 const SelectionDAG &DAG) const {
810 if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
811 return (MemVT.getSizeInBits() <= 4 * 32);
812 } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
813 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
814 return (MemVT.getSizeInBits() <= MaxPrivateBits);
815 } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
816 return (MemVT.getSizeInBits() <= 2 * 32);
817 }
818 return true;
819}
820
821bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
822 unsigned AddrSpace,
823 unsigned Align,
824 bool *IsFast) const {
825 if (IsFast)
826 *IsFast = false;
827
828 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
829 // which isn't a simple VT.
830 // Until MVT is extended to handle this, simply check for the size and
831 // rely on the condition below: allow accesses if the size is a multiple of 4.
832 if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
833 VT.getStoreSize() > 16)) {
834 return false;
835 }
836
837 if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
838 AddrSpace == AMDGPUASI.REGION_ADDRESS) {
839 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
840 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
841 // with adjacent offsets.
842 bool AlignedBy4 = (Align % 4 == 0);
843 if (IsFast)
844 *IsFast = AlignedBy4;
845
846 return AlignedBy4;
847 }
848
849 // FIXME: We have to be conservative here and assume that flat operations
850 // will access scratch. If we had access to the IR function, then we
851 // could determine if any private memory was used in the function.
852 if (!Subtarget->hasUnalignedScratchAccess() &&
853 (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
854 AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
855 return false;
856 }
857
858 if (Subtarget->hasUnalignedBufferAccess()) {
859 // If we have an uniform constant load, it still requires using a slow
860 // buffer instruction if unaligned.
861 if (IsFast) {
862 *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
863 AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
864 (Align % 4 == 0) : true;
865 }
866
867 return true;
868 }
869
870 // Smaller than dword value must be aligned.
871 if (VT.bitsLT(MVT::i32))
872 return false;
873
874 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
875 // byte-address are ignored, thus forcing Dword alignment.
876 // This applies to private, global, and constant memory.
877 if (IsFast)
878 *IsFast = true;
879
880 return VT.bitsGT(MVT::i32) && Align % 4 == 0;
881}
882
883EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
884 unsigned SrcAlign, bool IsMemset,
885 bool ZeroMemset,
886 bool MemcpyStrSrc,
887 MachineFunction &MF) const {
888 // FIXME: Should account for address space here.
889
890 // The default fallback uses the private pointer size as a guess for a type to
891 // use. Make sure we switch these to 64-bit accesses.
892
893 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
894 return MVT::v4i32;
895
896 if (Size >= 8 && DstAlign >= 4)
897 return MVT::v2i32;
898
899 // Use the default.
900 return MVT::Other;
901}
902
903static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
904 return AS == AMDGPUASI.GLOBAL_ADDRESS ||
905 AS == AMDGPUASI.FLAT_ADDRESS ||
906 AS == AMDGPUASI.CONSTANT_ADDRESS ||
907 AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
908}
909
910bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
911 unsigned DestAS) const {
912 return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
913 isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
914}
915
916bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
917 const MemSDNode *MemNode = cast<MemSDNode>(N);
918 const Value *Ptr = MemNode->getMemOperand()->getValue();
919 const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
920 return I && I->getMetadata("amdgpu.noclobber");
921}
922
923bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
924 unsigned DestAS) const {
925 // Flat -> private/local is a simple truncate.
926 // Flat -> global is no-op
927 if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
928 return true;
929
930 return isNoopAddrSpaceCast(SrcAS, DestAS);
931}
932
933bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
934 const MemSDNode *MemNode = cast<MemSDNode>(N);
935
936 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
937}
938
939TargetLoweringBase::LegalizeTypeAction
940SITargetLowering::getPreferredVectorAction(EVT VT) const {
941 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
942 return TypeSplitVector;
943
944 return TargetLoweringBase::getPreferredVectorAction(VT);
945}
946
947bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
948 Type *Ty) const {
949 // FIXME: Could be smarter if called for vector constants.
950 return true;
951}
952
953bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
954 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
955 switch (Op) {
956 case ISD::LOAD:
957 case ISD::STORE:
958
959 // These operations are done with 32-bit instructions anyway.
960 case ISD::AND:
961 case ISD::OR:
962 case ISD::XOR:
963 case ISD::SELECT:
964 // TODO: Extensions?
965 return true;
966 default:
967 return false;
968 }
969 }
970
971 // SimplifySetCC uses this function to determine whether or not it should
972 // create setcc with i1 operands. We don't have instructions for i1 setcc.
973 if (VT == MVT::i1 && Op == ISD::SETCC)
974 return false;
975
976 return TargetLowering::isTypeDesirableForOp(Op, VT);
977}
978
979SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
980 const SDLoc &SL,
981 SDValue Chain,
982 uint64_t Offset) const {
983 const DataLayout &DL = DAG.getDataLayout();
984 MachineFunction &MF = DAG.getMachineFunction();
985 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
986
987 const ArgDescriptor *InputPtrReg;
988 const TargetRegisterClass *RC;
989
990 std::tie(InputPtrReg, RC)
991 = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
992
993 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
994 MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
995 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
996 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
997
998 return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
999 DAG.getConstant(Offset, SL, PtrVT));
1000}
1001
1002SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1003 const SDLoc &SL) const {
1004 auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
1005 uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
1006 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1007}
1008
1009SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1010 const SDLoc &SL, SDValue Val,
1011 bool Signed,
1012 const ISD::InputArg *Arg) const {
1013 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1014 VT.bitsLT(MemVT)) {
1015 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1016 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1017 }
1018
1019 if (MemVT.isFloatingPoint())
1020 Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
1021 else if (Signed)
1022 Val = DAG.getSExtOrTrunc(Val, SL, VT);
1023 else
1024 Val = DAG.getZExtOrTrunc(Val, SL, VT);
1025
1026 return Val;
1027}
1028
1029SDValue SITargetLowering::lowerKernargMemParameter(
1030 SelectionDAG &DAG, EVT VT, EVT MemVT,
1031 const SDLoc &SL, SDValue Chain,
1032 uint64_t Offset, bool Signed,
1033 const ISD::InputArg *Arg) const {
1034 const DataLayout &DL = DAG.getDataLayout();
1035 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
1036 PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
1037 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1038
1039 unsigned Align = DL.getABITypeAlignment(Ty);
1040
1041 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1042 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
1043 MachineMemOperand::MODereferenceable |
1044 MachineMemOperand::MOInvariant);
1045
1046 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1047 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1048}
1049
1050SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1051 const SDLoc &SL, SDValue Chain,
1052 const ISD::InputArg &Arg) const {
1053 MachineFunction &MF = DAG.getMachineFunction();
1054 MachineFrameInfo &MFI = MF.getFrameInfo();
1055
1056 if (Arg.Flags.isByVal()) {
1057 unsigned Size = Arg.Flags.getByValSize();
1058 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1059 return DAG.getFrameIndex(FrameIdx, MVT::i32);
1060 }
1061
1062 unsigned ArgOffset = VA.getLocMemOffset();
1063 unsigned ArgSize = VA.getValVT().getStoreSize();
1064
1065 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1066
1067 // Create load nodes to retrieve arguments from the stack.
1068 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1069 SDValue ArgValue;
1070
1071 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
1072 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1073 MVT MemVT = VA.getValVT();
1074
1075 switch (VA.getLocInfo()) {
1076 default:
1077 break;
1078 case CCValAssign::BCvt:
1079 MemVT = VA.getLocVT();
1080 break;
1081 case CCValAssign::SExt:
1082 ExtType = ISD::SEXTLOAD;
1083 break;
1084 case CCValAssign::ZExt:
1085 ExtType = ISD::ZEXTLOAD;
1086 break;
1087 case CCValAssign::AExt:
1088 ExtType = ISD::EXTLOAD;
1089 break;
1090 }
1091
1092 ArgValue = DAG.getExtLoad(
1093 ExtType, SL, VA.getLocVT(), Chain, FIN,
1094 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1095 MemVT);
1096 return ArgValue;
1097}
1098
1099SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1100 const SIMachineFunctionInfo &MFI,
1101 EVT VT,
1102 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1103 const ArgDescriptor *Reg;
1104 const TargetRegisterClass *RC;
1105
1106 std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1107 return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1108}
1109
1110static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1111 CallingConv::ID CallConv,
1112 ArrayRef<ISD::InputArg> Ins,
1113 BitVector &Skipped,
1114 FunctionType *FType,
1115 SIMachineFunctionInfo *Info) {
1116 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1117 const ISD::InputArg &Arg = Ins[I];
1118
1119 // First check if it's a PS input addr.
1120 if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
1121 !Arg.Flags.isByVal() && PSInputNum <= 15) {
1122
1123 if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
1124 // We can safely skip PS inputs.
1125 Skipped.set(I);
1126 ++PSInputNum;
1127 continue;
1128 }
1129
1130 Info->markPSInputAllocated(PSInputNum);
1131 if (Arg.Used)
1132 Info->markPSInputEnabled(PSInputNum);
1133
1134 ++PSInputNum;
1135 }
1136
1137 // Second split vertices into their elements.
1138 if (Arg.VT.isVector()) {
1139 ISD::InputArg NewArg = Arg;
1140 NewArg.Flags.setSplit();
1141 NewArg.VT = Arg.VT.getVectorElementType();
1142
1143 // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
1144 // three or five element vertex only needs three or five registers,
1145 // NOT four or eight.
1146 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
1147 unsigned NumElements = ParamType->getVectorNumElements();
1148
1149 for (unsigned J = 0; J != NumElements; ++J) {
1150 Splits.push_back(NewArg);
1151 NewArg.PartOffset += NewArg.VT.getStoreSize();
1152 }
1153 } else {
1154 Splits.push_back(Arg);
1155 }
1156 }
1157}
1158
1159// Allocate special inputs passed in VGPRs.
1160static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1161 MachineFunction &MF,
1162 const SIRegisterInfo &TRI,
1163 SIMachineFunctionInfo &Info) {
1164 if (Info.hasWorkItemIDX()) {
1165 unsigned Reg = AMDGPU::VGPR0;
1166 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1167
1168 CCInfo.AllocateReg(Reg);
1169 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1170 }
1171
1172 if (Info.hasWorkItemIDY()) {
1173 unsigned Reg = AMDGPU::VGPR1;
1174 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1175
1176 CCInfo.AllocateReg(Reg);
1177 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1178 }
1179
1180 if (Info.hasWorkItemIDZ()) {
1181 unsigned Reg = AMDGPU::VGPR2;
1182 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1183
1184 CCInfo.AllocateReg(Reg);
1185 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1186 }
1187}
1188
1189// Try to allocate a VGPR at the end of the argument list, or if no argument
1190// VGPRs are left allocating a stack slot.
1191static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1192 ArrayRef<MCPhysReg> ArgVGPRs
1193 = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1194 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1195 if (RegIdx == ArgVGPRs.size()) {
1196 // Spill to stack required.
1197 int64_t Offset = CCInfo.AllocateStack(4, 4);
1198
1199 return ArgDescriptor::createStack(Offset);
1200 }
1201
1202 unsigned Reg = ArgVGPRs[RegIdx];
1203 Reg = CCInfo.AllocateReg(Reg);
1204 assert(Reg != AMDGPU::NoRegister)(static_cast <bool> (Reg != AMDGPU::NoRegister) ? void (
0) : __assert_fail ("Reg != AMDGPU::NoRegister", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1204, __extension__ __PRETTY_FUNCTION__))
;
1205
1206 MachineFunction &MF = CCInfo.getMachineFunction();
1207 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1208 return ArgDescriptor::createRegister(Reg);
1209}
1210
1211static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1212 const TargetRegisterClass *RC,
1213 unsigned NumArgRegs) {
1214 ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1215 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1216 if (RegIdx == ArgSGPRs.size())
1217 report_fatal_error("ran out of SGPRs for arguments");
1218
1219 unsigned Reg = ArgSGPRs[RegIdx];
1220 Reg = CCInfo.AllocateReg(Reg);
1221 assert(Reg != AMDGPU::NoRegister)(static_cast <bool> (Reg != AMDGPU::NoRegister) ? void (
0) : __assert_fail ("Reg != AMDGPU::NoRegister", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1221, __extension__ __PRETTY_FUNCTION__))
;
1222
1223 MachineFunction &MF = CCInfo.getMachineFunction();
1224 MF.addLiveIn(Reg, RC);
1225 return ArgDescriptor::createRegister(Reg);
1226}
1227
1228static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1229 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1230}
1231
1232static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1233 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1234}
1235
1236static void allocateSpecialInputVGPRs(CCState &CCInfo,
1237 MachineFunction &MF,
1238 const SIRegisterInfo &TRI,
1239 SIMachineFunctionInfo &Info) {
1240 if (Info.hasWorkItemIDX())
1241 Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1242
1243 if (Info.hasWorkItemIDY())
1244 Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1245
1246 if (Info.hasWorkItemIDZ())
1247 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1248}
1249
1250static void allocateSpecialInputSGPRs(CCState &CCInfo,
1251 MachineFunction &MF,
1252 const SIRegisterInfo &TRI,
1253 SIMachineFunctionInfo &Info) {
1254 auto &ArgInfo = Info.getArgInfo();
1255
1256 // TODO: Unify handling with private memory pointers.
1257
1258 if (Info.hasDispatchPtr())
1259 ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1260
1261 if (Info.hasQueuePtr())
1262 ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1263
1264 if (Info.hasKernargSegmentPtr())
1265 ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1266
1267 if (Info.hasDispatchID())
1268 ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1269
1270 // flat_scratch_init is not applicable for non-kernel functions.
1271
1272 if (Info.hasWorkGroupIDX())
1273 ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1274
1275 if (Info.hasWorkGroupIDY())
1276 ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1277
1278 if (Info.hasWorkGroupIDZ())
1279 ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1280
1281 if (Info.hasImplicitArgPtr())
1282 ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1283}
1284
1285// Allocate special inputs passed in user SGPRs.
1286static void allocateHSAUserSGPRs(CCState &CCInfo,
1287 MachineFunction &MF,
1288 const SIRegisterInfo &TRI,
1289 SIMachineFunctionInfo &Info) {
1290 if (Info.hasImplicitBufferPtr()) {
1291 unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1292 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1293 CCInfo.AllocateReg(ImplicitBufferPtrReg);
1294 }
1295
1296 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1297 if (Info.hasPrivateSegmentBuffer()) {
1298 unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1299 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1300 CCInfo.AllocateReg(PrivateSegmentBufferReg);
1301 }
1302
1303 if (Info.hasDispatchPtr()) {
1304 unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1305 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1306 CCInfo.AllocateReg(DispatchPtrReg);
1307 }
1308
1309 if (Info.hasQueuePtr()) {
1310 unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1311 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1312 CCInfo.AllocateReg(QueuePtrReg);
1313 }
1314
1315 if (Info.hasKernargSegmentPtr()) {
1316 unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1317 MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1318 CCInfo.AllocateReg(InputPtrReg);
1319 }
1320
1321 if (Info.hasDispatchID()) {
1322 unsigned DispatchIDReg = Info.addDispatchID(TRI);
1323 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1324 CCInfo.AllocateReg(DispatchIDReg);
1325 }
1326
1327 if (Info.hasFlatScratchInit()) {
1328 unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1329 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1330 CCInfo.AllocateReg(FlatScratchInitReg);
1331 }
1332
1333 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1334 // these from the dispatch pointer.
1335}
1336
1337// Allocate special input registers that are initialized per-wave.
1338static void allocateSystemSGPRs(CCState &CCInfo,
1339 MachineFunction &MF,
1340 SIMachineFunctionInfo &Info,
1341 CallingConv::ID CallConv,
1342 bool IsShader) {
1343 if (Info.hasWorkGroupIDX()) {
1344 unsigned Reg = Info.addWorkGroupIDX();
1345 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1346 CCInfo.AllocateReg(Reg);
1347 }
1348
1349 if (Info.hasWorkGroupIDY()) {
1350 unsigned Reg = Info.addWorkGroupIDY();
1351 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1352 CCInfo.AllocateReg(Reg);
1353 }
1354
1355 if (Info.hasWorkGroupIDZ()) {
1356 unsigned Reg = Info.addWorkGroupIDZ();
1357 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1358 CCInfo.AllocateReg(Reg);
1359 }
1360
1361 if (Info.hasWorkGroupInfo()) {
1362 unsigned Reg = Info.addWorkGroupInfo();
1363 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1364 CCInfo.AllocateReg(Reg);
1365 }
1366
1367 if (Info.hasPrivateSegmentWaveByteOffset()) {
1368 // Scratch wave offset passed in system SGPR.
1369 unsigned PrivateSegmentWaveByteOffsetReg;
1370
1371 if (IsShader) {
1372 PrivateSegmentWaveByteOffsetReg =
1373 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1374
1375 // This is true if the scratch wave byte offset doesn't have a fixed
1376 // location.
1377 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1378 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1379 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1380 }
1381 } else
1382 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1383
1384 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1385 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1386 }
1387}
1388
1389static void reservePrivateMemoryRegs(const TargetMachine &TM,
1390 MachineFunction &MF,
1391 const SIRegisterInfo &TRI,
1392 SIMachineFunctionInfo &Info) {
1393 // Now that we've figured out where the scratch register inputs are, see if
1394 // should reserve the arguments and use them directly.
1395 MachineFrameInfo &MFI = MF.getFrameInfo();
1396 bool HasStackObjects = MFI.hasStackObjects();
1397
1398 // Record that we know we have non-spill stack objects so we don't need to
1399 // check all stack objects later.
1400 if (HasStackObjects)
1401 Info.setHasNonSpillStackObjects(true);
1402
1403 // Everything live out of a block is spilled with fast regalloc, so it's
1404 // almost certain that spilling will be required.
1405 if (TM.getOptLevel() == CodeGenOpt::None)
1406 HasStackObjects = true;
1407
1408 // For now assume stack access is needed in any callee functions, so we need
1409 // the scratch registers to pass in.
1410 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1411
1412 const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
1413 if (ST.isAmdCodeObjectV2(MF)) {
1414 if (RequiresStackAccess) {
1415 // If we have stack objects, we unquestionably need the private buffer
1416 // resource. For the Code Object V2 ABI, this will be the first 4 user
1417 // SGPR inputs. We can reserve those and use them directly.
1418
1419 unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1420 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
1421 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1422
1423 if (MFI.hasCalls()) {
1424 // If we have calls, we need to keep the frame register in a register
1425 // that won't be clobbered by a call, so ensure it is copied somewhere.
1426
1427 // This is not a problem for the scratch wave offset, because the same
1428 // registers are reserved in all functions.
1429
1430 // FIXME: Nothing is really ensuring this is a call preserved register,
1431 // it's just selected from the end so it happens to be.
1432 unsigned ReservedOffsetReg
1433 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1434 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1435 } else {
1436 unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1437 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1438 Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1439 }
1440 } else {
1441 unsigned ReservedBufferReg
1442 = TRI.reservedPrivateSegmentBufferReg(MF);
1443 unsigned ReservedOffsetReg
1444 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1445
1446 // We tentatively reserve the last registers (skipping the last two
1447 // which may contain VCC). After register allocation, we'll replace
1448 // these with the ones immediately after those which were really
1449 // allocated. In the prologue copies will be inserted from the argument
1450 // to these reserved registers.
1451 Info.setScratchRSrcReg(ReservedBufferReg);
1452 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1453 }
1454 } else {
1455 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1456
1457 // Without HSA, relocations are used for the scratch pointer and the
1458 // buffer resource setup is always inserted in the prologue. Scratch wave
1459 // offset is still in an input SGPR.
1460 Info.setScratchRSrcReg(ReservedBufferReg);
1461
1462 if (HasStackObjects && !MFI.hasCalls()) {
1463 unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1464 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1465 Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1466 } else {
1467 unsigned ReservedOffsetReg
1468 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1469 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1470 }
1471 }
1472}
1473
1474bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1475 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1476 return !Info->isEntryFunction();
1477}
1478
1479void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1480
1481}
1482
1483void SITargetLowering::insertCopiesSplitCSR(
1484 MachineBasicBlock *Entry,
1485 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1486 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1487
1488 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1489 if (!IStart)
1490 return;
1491
1492 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1493 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1494 MachineBasicBlock::iterator MBBI = Entry->begin();
1495 for (const MCPhysReg *I = IStart; *I; ++I) {
1496 const TargetRegisterClass *RC = nullptr;
1497 if (AMDGPU::SReg_64RegClass.contains(*I))
1498 RC = &AMDGPU::SGPR_64RegClass;
1499 else if (AMDGPU::SReg_32RegClass.contains(*I))
1500 RC = &AMDGPU::SGPR_32RegClass;
1501 else
1502 llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1502)
;
1503
1504 unsigned NewVR = MRI->createVirtualRegister(RC);
1505 // Create copy from CSR to a virtual register.
1506 Entry->addLiveIn(*I);
1507 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1508 .addReg(*I);
1509
1510 // Insert the copy-back instructions right before the terminator.
1511 for (auto *Exit : Exits)
1512 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1513 TII->get(TargetOpcode::COPY), *I)
1514 .addReg(NewVR);
1515 }
1516}
1517
1518SDValue SITargetLowering::LowerFormalArguments(
1519 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1520 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1521 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1522 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1523
1524 MachineFunction &MF = DAG.getMachineFunction();
1525 FunctionType *FType = MF.getFunction().getFunctionType();
1526 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1527 const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
1528
1529 if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1530 const Function &Fn = MF.getFunction();
1531 DiagnosticInfoUnsupported NoGraphicsHSA(
1532 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1533 DAG.getContext()->diagnose(NoGraphicsHSA);
1534 return DAG.getEntryNode();
1535 }
1536
1537 // Create stack objects that are used for emitting debugger prologue if
1538 // "amdgpu-debugger-emit-prologue" attribute was specified.
1539 if (ST.debuggerEmitPrologue())
1540 createDebuggerPrologueStackObjects(MF);
1541
1542 SmallVector<ISD::InputArg, 16> Splits;
1543 SmallVector<CCValAssign, 16> ArgLocs;
1544 BitVector Skipped(Ins.size());
1545 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1546 *DAG.getContext());
1547
1548 bool IsShader = AMDGPU::isShader(CallConv);
1549 bool IsKernel = AMDGPU::isKernel(CallConv);
1550 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1551
1552 if (!IsEntryFunc) {
1553 // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1554 // this when allocating argument fixed offsets.
1555 CCInfo.AllocateStack(4, 4);
1556 }
1557
1558 if (IsShader) {
1559 processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1560
1561 // At least one interpolation mode must be enabled or else the GPU will
1562 // hang.
1563 //
1564 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1565 // set PSInputAddr, the user wants to enable some bits after the compilation
1566 // based on run-time states. Since we can't know what the final PSInputEna
1567 // will look like, so we shouldn't do anything here and the user should take
1568 // responsibility for the correct programming.
1569 //
1570 // Otherwise, the following restrictions apply:
1571 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1572 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1573 // enabled too.
1574 if (CallConv == CallingConv::AMDGPU_PS) {
1575 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1576 ((Info->getPSInputAddr() & 0xF) == 0 &&
1577 Info->isPSInputAllocated(11))) {
1578 CCInfo.AllocateReg(AMDGPU::VGPR0);
1579 CCInfo.AllocateReg(AMDGPU::VGPR1);
1580 Info->markPSInputAllocated(0);
1581 Info->markPSInputEnabled(0);
1582 }
1583 if (Subtarget->isAmdPalOS()) {
1584 // For isAmdPalOS, the user does not enable some bits after compilation
1585 // based on run-time states; the register values being generated here are
1586 // the final ones set in hardware. Therefore we need to apply the
1587 // workaround to PSInputAddr and PSInputEnable together. (The case where
1588 // a bit is set in PSInputAddr but not PSInputEnable is where the
1589 // frontend set up an input arg for a particular interpolation mode, but
1590 // nothing uses that input arg. Really we should have an earlier pass
1591 // that removes such an arg.)
1592 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1593 if ((PsInputBits & 0x7F) == 0 ||
1594 ((PsInputBits & 0xF) == 0 &&
1595 (PsInputBits >> 11 & 1)))
1596 Info->markPSInputEnabled(
1597 countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
1598 }
1599 }
1600
1601 assert(!Info->hasDispatchPtr() &&(static_cast <bool> (!Info->hasDispatchPtr() &&
!Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit
() && !Info->hasWorkGroupIDX() && !Info->
hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() &&
!Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX
() && !Info->hasWorkItemIDY() && !Info->
hasWorkItemIDZ()) ? void (0) : __assert_fail ("!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1606, __extension__ __PRETTY_FUNCTION__))
1602 !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&(static_cast <bool> (!Info->hasDispatchPtr() &&
!Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit
() && !Info->hasWorkGroupIDX() && !Info->
hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() &&
!Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX
() && !Info->hasWorkItemIDY() && !Info->
hasWorkItemIDZ()) ? void (0) : __assert_fail ("!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1606, __extension__ __PRETTY_FUNCTION__))
1603 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&(static_cast <bool> (!Info->hasDispatchPtr() &&
!Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit
() && !Info->hasWorkGroupIDX() && !Info->
hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() &&
!Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX
() && !Info->hasWorkItemIDY() && !Info->
hasWorkItemIDZ()) ? void (0) : __assert_fail ("!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1606, __extension__ __PRETTY_FUNCTION__))
1604 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&(static_cast <bool> (!Info->hasDispatchPtr() &&
!Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit
() && !Info->hasWorkGroupIDX() && !Info->
hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() &&
!Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX
() && !Info->hasWorkItemIDY() && !Info->
hasWorkItemIDZ()) ? void (0) : __assert_fail ("!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1606, __extension__ __PRETTY_FUNCTION__))
1605 !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&(static_cast <bool> (!Info->hasDispatchPtr() &&
!Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit
() && !Info->hasWorkGroupIDX() && !Info->
hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() &&
!Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX
() && !Info->hasWorkItemIDY() && !Info->
hasWorkItemIDZ()) ? void (0) : __assert_fail ("!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1606, __extension__ __PRETTY_FUNCTION__))
1606 !Info->hasWorkItemIDZ())(static_cast <bool> (!Info->hasDispatchPtr() &&
!Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit
() && !Info->hasWorkGroupIDX() && !Info->
hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() &&
!Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX
() && !Info->hasWorkItemIDY() && !Info->
hasWorkItemIDZ()) ? void (0) : __assert_fail ("!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1606, __extension__ __PRETTY_FUNCTION__))
;
1607 } else if (IsKernel) {
1608 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX())(static_cast <bool> (Info->hasWorkGroupIDX() &&
Info->hasWorkItemIDX()) ? void (0) : __assert_fail ("Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1608, __extension__ __PRETTY_FUNCTION__))
;
1609 } else {
1610 Splits.append(Ins.begin(), Ins.end());
1611 }
1612
1613 if (IsEntryFunc) {
1614 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1615 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1616 }
1617
1618 if (IsKernel) {
1619 analyzeFormalArgumentsCompute(CCInfo, Ins);
1620 } else {
1621 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1622 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1623 }
1624
1625 SmallVector<SDValue, 16> Chains;
1626
1627 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
1628 const ISD::InputArg &Arg = Ins[i];
1629 if (Skipped[i]) {
1630 InVals.push_back(DAG.getUNDEF(Arg.VT));
1631 continue;
1632 }
1633
1634 CCValAssign &VA = ArgLocs[ArgIdx++];
1635 MVT VT = VA.getLocVT();
1636
1637 if (IsEntryFunc && VA.isMemLoc()) {
1638 VT = Ins[i].VT;
1639 EVT MemVT = VA.getLocVT();
1640
1641 const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
1642 VA.getLocMemOffset();
1643 Info->setABIArgOffset(Offset + MemVT.getStoreSize());
1644
1645 // The first 36 bytes of the input buffer contains information about
1646 // thread group and global sizes.
1647 SDValue Arg = lowerKernargMemParameter(
1648 DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
1649 Chains.push_back(Arg.getValue(1));
1650
1651 auto *ParamTy =
1652 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
1653 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
1654 ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1655 // On SI local pointers are just offsets into LDS, so they are always
1656 // less than 16-bits. On CI and newer they could potentially be
1657 // real pointers, so we can't guarantee their size.
1658 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1659 DAG.getValueType(MVT::i16));
1660 }
1661
1662 InVals.push_back(Arg);
1663 continue;
1664 } else if (!IsEntryFunc && VA.isMemLoc()) {
1665 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1666 InVals.push_back(Val);
1667 if (!Arg.Flags.isByVal())
1668 Chains.push_back(Val.getValue(1));
1669 continue;
1670 }
1671
1672 assert(VA.isRegLoc() && "Parameter must be in a register!")(static_cast <bool> (VA.isRegLoc() && "Parameter must be in a register!"
) ? void (0) : __assert_fail ("VA.isRegLoc() && \"Parameter must be in a register!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1672, __extension__ __PRETTY_FUNCTION__))
;
1673
1674 unsigned Reg = VA.getLocReg();
1675 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
1676 EVT ValVT = VA.getValVT();
1677
1678 Reg = MF.addLiveIn(Reg, RC);
1679 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1680
1681 if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
1682 // The return object should be reasonably addressable.
1683
1684 // FIXME: This helps when the return is a real sret. If it is a
1685 // automatically inserted sret (i.e. CanLowerReturn returns false), an
1686 // extra copy is inserted in SelectionDAGBuilder which obscures this.
1687 unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
1688 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1689 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
1690 }
1691
1692 // If this is an 8 or 16-bit value, it is really passed promoted
1693 // to 32 bits. Insert an assert[sz]ext to capture this, then
1694 // truncate to the right size.
1695 switch (VA.getLocInfo()) {
1696 case CCValAssign::Full:
1697 break;
1698 case CCValAssign::BCvt:
1699 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
1700 break;
1701 case CCValAssign::SExt:
1702 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
1703 DAG.getValueType(ValVT));
1704 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1705 break;
1706 case CCValAssign::ZExt:
1707 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1708 DAG.getValueType(ValVT));
1709 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1710 break;
1711 case CCValAssign::AExt:
1712 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1713 break;
1714 default:
1715 llvm_unreachable("Unknown loc info!")::llvm::llvm_unreachable_internal("Unknown loc info!", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1715)
;
1716 }
1717
1718 if (IsShader && Arg.VT.isVector()) {
1719 // Build a vector from the registers
1720 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
1721 unsigned NumElements = ParamType->getVectorNumElements();
1722
1723 SmallVector<SDValue, 4> Regs;
1724 Regs.push_back(Val);
1725 for (unsigned j = 1; j != NumElements; ++j) {
1726 Reg = ArgLocs[ArgIdx++].getLocReg();
1727 Reg = MF.addLiveIn(Reg, RC);
1728
1729 SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1730 Regs.push_back(Copy);
1731 }
1732
1733 // Fill up the missing vector elements
1734 NumElements = Arg.VT.getVectorNumElements() - NumElements;
1735 Regs.append(NumElements, DAG.getUNDEF(VT));
1736
1737 InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
1738 continue;
1739 }
1740
1741 InVals.push_back(Val);
1742 }
1743
1744 if (!IsEntryFunc) {
1745 // Special inputs come after user arguments.
1746 allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
1747 }
1748
1749 // Start adding system SGPRs.
1750 if (IsEntryFunc) {
1751 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
1752 } else {
1753 CCInfo.AllocateReg(Info->getScratchRSrcReg());
1754 CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
1755 CCInfo.AllocateReg(Info->getFrameOffsetReg());
1756 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
1757 }
1758
1759 auto &ArgUsageInfo =
1760 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
1761 ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo());
1762
1763 unsigned StackArgSize = CCInfo.getNextStackOffset();
1764 Info->setBytesInStackArgArea(StackArgSize);
1765
1766 return Chains.empty() ? Chain :
1767 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
1768}
1769
1770// TODO: If return values can't fit in registers, we should return as many as
1771// possible in registers before passing on stack.
/// Decide whether the return values described by \p Outs can be lowered
/// directly for calling convention \p CallConv.
///
/// Entry-function calling conventions are accepted unconditionally, without
/// inspecting \p Outs. For every other convention the result is whatever
/// CCState::CheckReturn reports for the assignment function chosen by
/// CCAssignFnForReturn(CallConv, IsVarArg).
1772bool SITargetLowering::CanLowerReturn(
1773 CallingConv::ID CallConv,
1774 MachineFunction &MF, bool IsVarArg,
1775 const SmallVectorImpl<ISD::OutputArg> &Outs,
1776 LLVMContext &Context) const {
1777 // Replacing returns with sret/stack usage doesn't make sense for shaders.
1778 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
1779 // for shaders. Vector types should be explicitly handled by CC.
1780 if (AMDGPU::isEntryFunctionCC(CallConv))
1781 return true;
1782
  // RVLocs only serves as scratch storage for the check; the locations written
  // into it are not used after this function returns.
1783 SmallVector<CCValAssign, 16> RVLocs;
1784 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
1785 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
1786}
1787
/// Build the DAG node that terminates the function for a return of
/// \p Outs / \p OutVals.
///
/// Kernel calling conventions are forwarded unchanged to
/// AMDGPUTargetLowering::LowerReturn. For everything else each return value is
/// copied into the register selected by CCAssignFnForReturn, and the node
/// created is one of:
///  - AMDGPUISD::ENDPGM           for a shader that returns nothing,
///  - AMDGPUISD::RETURN_TO_EPILOG for a shader that returns values,
///  - AMDGPUISD::RET_FLAG         for a non-shader function.
///
/// Side effect: records on SIMachineFunctionInfo whether the function returns
/// void (setIfReturnsVoid).
///
/// \returns the terminating node (type MVT::Other); its operand 0 is the chain
/// after all register copies.
1788SDValue
1789SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
1790 bool isVarArg,
1791 const SmallVectorImpl<ISD::OutputArg> &Outs,
1792 const SmallVectorImpl<SDValue> &OutVals,
1793 const SDLoc &DL, SelectionDAG &DAG) const {
1794 MachineFunction &MF = DAG.getMachineFunction();
1795 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1796
  // Kernels are handled entirely by the base-class implementation.
1797 if (AMDGPU::isKernel(CallConv)) {
1798 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
1799 OutVals, DL, DAG);
1800 }
1801
1802 bool IsShader = AMDGPU::isShader(CallConv);
1803
  // IsWaveEnd (shader with no return values) keeps the ENDPGM opcode chosen at
  // the bottom of this function.
1804 Info->setIfReturnsVoid(Outs.size() == 0);
1805 bool IsWaveEnd = Info->returnsVoid() && IsShader;
1806
  // Splits / SplitVals are parallel lists: entry k of SplitVals is the value
  // described by entry k of Splits.
1807 SmallVector<ISD::OutputArg, 48> Splits;
1808 SmallVector<SDValue, 48> SplitVals;
1809
1810 // Split vectors into their elements.
  // Only shader vector outputs are split; every other output is passed through
  // unchanged.
1811 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
1812 const ISD::OutputArg &Out = Outs[i];
1813
1814 if (IsShader && Out.VT.isVector()) {
1815 MVT VT = Out.VT.getVectorElementType();
1816 ISD::OutputArg NewOut = Out;
1817 NewOut.Flags.setSplit();
1818 NewOut.VT = VT;
1819
1820 // We want the original number of vector elements here, e.g.
1821 // three or five, not four or eight.
1822 unsigned NumElements = Out.ArgVT.getVectorNumElements();
1823
1824 for (unsigned j = 0; j != NumElements; ++j) {
1825 SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
1826 DAG.getConstant(j, DL, MVT::i32));
1827 SplitVals.push_back(Elem);
1828 Splits.push_back(NewOut);
      // Advance the part offset by one element's store size for the next
      // element.
1829 NewOut.PartOffset += NewOut.VT.getStoreSize();
1830 }
1831 } else {
1832 SplitVals.push_back(OutVals[i]);
1833 Splits.push_back(Out);
1834 }
1835 }
1836
1837 // CCValAssign - represent the assignment of the return value to a location.
1838 SmallVector<CCValAssign, 48> RVLocs;
1839
1840 // CCState - Info about the registers and stack slots.
1841 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1842 *DAG.getContext());
1843
1844 // Analyze outgoing return values.
1845 CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
1846
  // Flag carries the glue result of the most recent CopyToReg so that the
  // copies and the final node are linked together.
1847 SDValue Flag;
1848 SmallVector<SDValue, 48> RetOps;
1849 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1850
1851 // Add return address for callable functions.
1852 if (!Info->isEntryFunction()) {
1853 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1854 SDValue ReturnAddrReg = CreateLiveInRegister(
1855 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
1856
1857 // FIXME: Should be able to use a vreg here, but need a way to prevent it
1858 // from being allocated to a CSR.
1859
1860 SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
1861 MVT::i64);
1862
1863 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
1864 Flag = Chain.getValue(1);
1865
1866 RetOps.push_back(PhysReturnAddrReg);
1867 }
1868
1869 // Copy the result values into the output registers.
  // realRVLocIdx is incremented together with i, so it always equals i here.
1870 for (unsigned i = 0, realRVLocIdx = 0;
1871 i != RVLocs.size();
1872 ++i, ++realRVLocIdx) {
1873 CCValAssign &VA = RVLocs[i];
1874 assert(VA.isRegLoc() && "Can only return in registers!")(static_cast <bool> (VA.isRegLoc() && "Can only return in registers!"
) ? void (0) : __assert_fail ("VA.isRegLoc() && \"Can only return in registers!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1874, __extension__ __PRETTY_FUNCTION__))
;
1875 // TODO: Partially return in registers if return values don't fit.
1876
1877 SDValue Arg = SplitVals[realRVLocIdx];
1878
1879 // Copied from other backends.
  // Convert the value to the location type according to the extension kind
  // recorded in the assignment.
1880 switch (VA.getLocInfo()) {
1881 case CCValAssign::Full:
1882 break;
1883 case CCValAssign::BCvt:
1884 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
1885 break;
1886 case CCValAssign::SExt:
1887 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
1888 break;
1889 case CCValAssign::ZExt:
1890 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
1891 break;
1892 case CCValAssign::AExt:
1893 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
1894 break;
1895 default:
1896 llvm_unreachable("Unknown loc info!")::llvm::llvm_unreachable_internal("Unknown loc info!", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1896)
;
1897 }
1898
1899 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
1900 Flag = Chain.getValue(1);
1901 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1902 }
1903
1904 // FIXME: Does sret work properly?
  // For callable functions, also list the callee-saved registers reported by
  // getCalleeSavedRegsViaCopy as operands of the return node.
1905 if (!Info->isEntryFunction()) {
1906 const SIRegisterInfo *TRI
1907 = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
1908 const MCPhysReg *I =
1909 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
1910 if (I) {
    // The list is walked until a zero entry is reached.
1911 for (; *I; ++I) {
1912 if (AMDGPU::SReg_64RegClass.contains(*I))
1913 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
1914 else if (AMDGPU::SReg_32RegClass.contains(*I))
1915 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
1916 else
1917 llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1917)
;
1918 }
1919 }
1920 }
1921
1922 // Update chain and glue.
  // Flag is appended last, and only when it actually holds a node.
1923 RetOps[0] = Chain;
1924 if (Flag.getNode())
1925 RetOps.push_back(Flag);
1926
  // ENDPGM is the default; it is replaced unless this is a void shader.
1927 unsigned Opc = AMDGPUISD::ENDPGM;
1928 if (!IsWaveEnd)
1929 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
1930 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
1931}
1932
/// Copy the values returned by a call out of their assigned physical
/// registers and append them to \p InVals.
///
/// Result locations are computed with CCState::AnalyzeCallResult using the
/// assignment function from CCAssignFnForReturn(CallConv, IsVarArg). One
/// value is appended to \p InVals per location, in location order. A result
/// assigned to memory triggers report_fatal_error.
///
/// \p IsThisReturn and \p ThisVal are not referenced in this function body.
///
/// \returns the chain produced by the last register copy (or the incoming
/// \p Chain when there are no results).
1933SDValue SITargetLowering::LowerCallResult(
1934 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
1935 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1936 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
1937 SDValue ThisVal) const {
1938 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
1939
1940 // Assign locations to each value returned by this call.
1941 SmallVector<CCValAssign, 16> RVLocs;
1942 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
1943 *DAG.getContext());
1944 CCInfo.AnalyzeCallResult(Ins, RetCC);
1945
1946 // Copy all of the result registers out of their specified physreg.
1947 for (unsigned i = 0; i != RVLocs.size(); ++i) {
1948 CCValAssign VA = RVLocs[i];
1949 SDValue Val;
1950
1951 if (VA.isRegLoc()) {
    // Thread both the chain (result 1) and the glue (result 2) of each copy
    // into the next one.
1952 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
1953 Chain = Val.getValue(1);
1954 InFlag = Val.getValue(2);
1955 } else if (VA.isMemLoc()) {
1956 report_fatal_error("TODO: return values in memory");
1957 } else
1958 llvm_unreachable("unknown argument location type")::llvm::llvm_unreachable_internal("unknown argument location type"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1958)
;
1959
  // Convert from the location type back to the value type. For zero/sign
  // extension an Assert[SZ]ext node records the extension before truncating.
1960 switch (VA.getLocInfo()) {
1961 case CCValAssign::Full:
1962 break;
1963 case CCValAssign::BCvt:
1964 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
1965 break;
1966 case CCValAssign::ZExt:
1967 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
1968 DAG.getValueType(VA.getValVT()));
1969 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
1970 break;
1971 case CCValAssign::SExt:
1972 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
1973 DAG.getValueType(VA.getValVT()));
1974 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
1975 break;
1976 case CCValAssign::AExt:
1977 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
1978 break;
1979 default:
1980 llvm_unreachable("Unknown loc info!")::llvm::llvm_unreachable_internal("Unknown loc info!", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1980)
;
1981 }
1982
1983 InVals.push_back(Val);
1984 }
1985
1986 return Chain;
1987}
1988
1989// Add code to pass special inputs required depending on used features separate
1990// from the explicit user arguments present in the IR.
/// For each preloaded value the callee expects, obtain the caller's copy and
/// arrange for it to be passed.
///
/// \param CLI          Call being lowered; supplies the call site, DAG and
///                     debug location. Nothing is done when it has no call
///                     site.
/// \param Info         Caller's function info; source of the caller's
///                     argument descriptors.
/// \param RegsToPass   Receives (register, value) pairs for inputs the callee
///                     takes in registers.
/// \param MemOpChains  Receives the stores created for inputs the callee
///                     takes on the stack.
/// \param Chain        Chain used for those stores.
/// \param StackPtr     Base passed to storeStackInputValue for those stores.
1991void SITargetLowering::passSpecialInputs(
1992 CallLoweringInfo &CLI,
1993 const SIMachineFunctionInfo &Info,
1994 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
1995 SmallVectorImpl<SDValue> &MemOpChains,
1996 SDValue Chain,
1997 SDValue StackPtr) const {
1998 // If we don't have a call site, this was a call inserted by
1999 // legalization. These can never use special inputs.
2000 if (!CLI.CS)
2001 return;
2002
  // A directly called function is required here (asserted below).
2003 const Function *CalleeFunc = CLI.CS.getCalledFunction();
2004 assert(CalleeFunc)(static_cast <bool> (CalleeFunc) ? void (0) : __assert_fail
("CalleeFunc", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2004, __extension__ __PRETTY_FUNCTION__))
;
2005
2006 SelectionDAG &DAG = CLI.DAG;
2007 const SDLoc &DL = CLI.DL;
2008
2009 const SISubtarget *ST = getSubtarget();
2010 const SIRegisterInfo *TRI = ST->getRegisterInfo();
2011
  // CalleeArgInfo says where the callee wants each input; CallerArgInfo says
  // where the caller currently holds it.
2012 auto &ArgUsageInfo =
2013 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2014 const AMDGPUFunctionArgInfo &CalleeArgInfo
2015 = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2016
2017 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2018
2019 // TODO: Unify with private memory register handling. This is complicated by
2020 // the fact that at least in kernels, the input argument is not necessarily
2021 // in the same location as the input.
2022 AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
2023 AMDGPUFunctionArgInfo::DISPATCH_PTR,
2024 AMDGPUFunctionArgInfo::QUEUE_PTR,
2025 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
2026 AMDGPUFunctionArgInfo::DISPATCH_ID,
2027 AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
2028 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
2029 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
2030 AMDGPUFunctionArgInfo::WORKITEM_ID_X,
2031 AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
2032 AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
2033 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
2034 };
2035
2036 for (auto InputID : InputRegs) {
2037 const ArgDescriptor *OutgoingArg;
2038 const TargetRegisterClass *ArgRC;
2039
    // Inputs the callee has no descriptor for are skipped.
2040 std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2041 if (!OutgoingArg)
2042 continue;
2043
2044 const ArgDescriptor *IncomingArg;
2045 const TargetRegisterClass *IncomingArgRC;
2046 std::tie(IncomingArg, IncomingArgRC)
2047 = CallerArgInfo.getPreloadedValue(InputID);
2048 assert(IncomingArgRC == ArgRC)(static_cast <bool> (IncomingArgRC == ArgRC) ? void (0)
: __assert_fail ("IncomingArgRC == ArgRC", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2048, __extension__ __PRETTY_FUNCTION__))
;
2049
2050 // All special arguments are ints for now.
    // An 8-byte spill size selects i64; anything else is treated as i32.
2051 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2052 SDValue InputReg;
2053
2054 if (IncomingArg) {
2055 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2056 } else {
2057 // The implicit arg ptr is special because it doesn't have a corresponding
2058 // input for kernels, and is computed from the kernarg segment pointer.
2059 assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR)(static_cast <bool> (InputID == AMDGPUFunctionArgInfo::
IMPLICIT_ARG_PTR) ? void (0) : __assert_fail ("InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2059, __extension__ __PRETTY_FUNCTION__))
;
2060 InputReg = getImplicitArgPtr(DAG, DL);
2061 }
2062
    // Hand the value over in the form the callee expects: a register pair
    // entry, or a store at the callee's stack offset.
2063 if (OutgoingArg->isRegister()) {
2064 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2065 } else {
2066 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
2067 InputReg,
2068 OutgoingArg->getStackOffset());
2069 MemOpChains.push_back(ArgStore);
2070 }
2071 }
2072}
2073
/// Return true if \p CC is CallingConv::Fast, the only calling convention this
/// helper accepts; every other convention yields false.
2074static bool canGuaranteeTCO(CallingConv::ID CC) {
2075 return CC == CallingConv::Fast;
2076}
2077
2078/// Return true if we might ever do TCO for calls with this calling convention.
/// That is CallingConv::C, plus whatever canGuaranteeTCO() accepts.
2079static bool mayTailCallThisCC(CallingConv::ID CC) {
2080 switch (CC) {
2081 case CallingConv::C:
2082 return true;
2083 default:
    // Any other convention qualifies only if canGuaranteeTCO() accepts it.
2084 return canGuaranteeTCO(CC);
2085 }
2086}
2087
/// Decide whether a call with calling convention \p CalleeCC may be lowered as
/// a tail call from the current function.
///
/// The checks run in this order and the first failing one returns false:
///  1. \p CalleeCC must pass mayTailCallThisCC().
///  2. The caller must have a call-preserved register mask.
///  3. With GuaranteedTailCallOpt set, the answer is simply whether
///     canGuaranteeTCO(CalleeCC) holds and caller/callee conventions match.
///  4. The call must not be variadic.
///  5. No caller argument may carry the byval attribute.
///  6. Call results must be compatible per CCState::resultsCompatible.
///  7. If the conventions differ, the caller's preserved mask must be accepted
///     by regmaskSubsetEqual against the callee's.
///  8. A call without outgoing arguments is accepted at this point.
///  9. The callee's stack arguments must fit in the caller's stack argument
///     area.
/// 10. parametersInCSRMatch() gives the final answer.
///
/// \p Callee is not referenced in this function body.
2088bool SITargetLowering::isEligibleForTailCallOptimization(
2089 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2090 const SmallVectorImpl<ISD::OutputArg> &Outs,
2091 const SmallVectorImpl<SDValue> &OutVals,
2092 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2093 if (!mayTailCallThisCC(CalleeCC))
2094 return false;
2095
2096 MachineFunction &MF = DAG.getMachineFunction();
2097 const Function &CallerF = MF.getFunction();
2098 CallingConv::ID CallerCC = CallerF.getCallingConv();
2099 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2100 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2101
2102 // Kernels aren't callable, and don't have a live in return address so it
2103 // doesn't make sense to do a tail call with entry functions.
2104 if (!CallerPreserved)
2105 return false;
2106
2107 bool CCMatch = CallerCC == CalleeCC;
2108
  // Under GuaranteedTailCallOpt none of the later checks are consulted.
2109 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2110 if (canGuaranteeTCO(CalleeCC) && CCMatch)
2111 return true;
2112 return false;
2113 }
2114
2115 // TODO: Can we handle var args?
2116 if (IsVarArg)
2117 return false;
2118
  // Any byval argument on the caller rules the tail call out.
2119 for (const Argument &Arg : CallerF.args()) {
2120 if (Arg.hasByValAttr())
2121 return false;
2122 }
2123
2124 LLVMContext &Ctx = *DAG.getContext();
2125
2126 // Check that the call results are passed in the same way.
2127 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2128 CCAssignFnForCall(CalleeCC, IsVarArg),
2129 CCAssignFnForCall(CallerCC, IsVarArg)))
2130 return false;
2131
2132 // The callee has to preserve all registers the caller needs to preserve.
2133 if (!CCMatch) {
2134 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2135 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2136 return false;
2137 }
2138
2139 // Nothing more to check if the callee is taking no arguments.
2140 if (Outs.empty())
2141 return true;
2142
  // Assign locations to the outgoing arguments so their stack usage and
  // register placement can be examined.
2143 SmallVector<CCValAssign, 16> ArgLocs;
2144 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2145
2146 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2147
2148 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2149 // If the stack arguments for this call do not fit into our own save area then
2150 // the call cannot be made tail.
2151 // TODO: Is this really necessary?
2152 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2153 return false;
2154
2155 const MachineRegisterInfo &MRI = MF.getRegInfo();
2156 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2157}
2158
/// Return true if the call instruction \p CI may be emitted as a tail call.
///
/// That requires all of the following: \p CI is marked as a tail call, the
/// function containing it does not use an entry-function calling convention,
/// and that function's "disable-tail-calls" attribute is not the string
/// "true".
2159bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2160 if (!CI->isTailCall())
2161 return false;
2162
  // CI's parent is its basic block; that block's parent is the function.
2163 const Function *ParentFn = CI->getParent()->getParent();
2164 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2165 return false;
2166
2167 auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2168 return (Attr.getValueAsString() != "true");
2169}
2170
2171// The wave scratch offset register is used as the global base pointer.
2172SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2173 SmallVectorImpl<SDValue> &InVals) const {
2174 SelectionDAG &DAG = CLI.DAG;
2175 const SDLoc &DL = CLI.DL;
2176 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2177 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2178 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2179 SDValue Chain = CLI.Chain;
2180 SDValue Callee = CLI.Callee;
2181 bool &IsTailCall = CLI.IsTailCall;
2182 CallingConv::ID CallConv = CLI.CallConv;
2183 bool IsVarArg = CLI.IsVarArg;
2184 bool IsSibCall = false;
2185 bool IsThisReturn = false;
2186 MachineFunction &MF = DAG.getMachineFunction();
2187
2188 if (IsVarArg) {
2189 return lowerUnhandledCall(CLI, InVals,
2190 "unsupported call to variadic function ");
2191 }
2192
2193 if (!CLI.CS.getCalledFunction()) {
2194 return lowerUnhandledCall(CLI, InVals,
2195 "unsupported indirect call to function ");
2196 }
2197
2198 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2199 return lowerUnhandledCall(CLI, InVals,
2200 "unsupported required tail call to function ");
2201 }
2202
2203 // The first 4 bytes are reserved for the callee's emergency stack slot.
2204 const unsigned CalleeUsableStackOffset = 4;
2205
2206 if (IsTailCall) {
2207 IsTailCall = isEligibleForTailCallOptimization(
2208 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2209 if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2210 report_fatal_error("failed to perform tail call elimination on a call "
2211 "site marked musttail");
2212 }
2213
2214 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2215
2216 // A sibling call is one where we're under the usual C ABI and not planning
2217 // to change that but can still do a tail call:
2218 if (!TailCallOpt && IsTailCall)
2219 IsSibCall = true;
2220
2221 if (IsTailCall)
2222 ++NumTailCalls;
2223 }
2224
2225 if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
2226 // FIXME: Remove this hack for function pointer types after removing
2227 // support of old address space mapping. In the new address space
2228 // mapping the pointer in default address space is 64 bit, therefore
2229 // does not need this hack.
2230 if (Callee.getValueType() == MVT::i32) {
2231 const GlobalValue *GV = GA->getGlobal();
2232 Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
2233 GA->getTargetFlags());
2234 }
2235 }
2236 assert(Callee.getValueType() == MVT::i64)(static_cast <bool> (Callee.getValueType() == MVT::i64)
? void (0) : __assert_fail ("Callee.getValueType() == MVT::i64"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2236, __extension__ __PRETTY_FUNCTION__))
;
2237
2238 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2239
2240 // Analyze operands of the call, assigning locations to each operand.
2241 SmallVector<CCValAssign, 16> ArgLocs;
2242 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2243 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2244 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2245
2246 // Get a count of how many bytes are to be pushed on the stack.
2247 unsigned NumBytes = CCInfo.getNextStackOffset();
2248
2249 if (IsSibCall) {
2250 // Since we're not changing the ABI to make this a tail call, the memory
2251 // operands are already available in the caller's incoming argument space.
2252 NumBytes = 0;
2253 }
2254
2255 // FPDiff is the byte offset of the call's argument area from the callee's.
2256 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2257 // by this amount for a tail call. In a sibling call it must be 0 because the
2258 // caller will deallocate the entire stack and the callee still expects its
2259 // arguments to begin at SP+0. Completely unused for non-tail calls.
2260 int32_t FPDiff = 0;
2261 MachineFrameInfo &MFI = MF.getFrameInfo();
2262 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2263
2264 SDValue CallerSavedFP;
2265
2266 // Adjust the stack pointer for the new arguments...
2267 // These operations are automatically eliminated by the prolog/epilog pass
2268 if (!IsSibCall) {
2269 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2270
2271 unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2272
2273 // In the HSA case, this should be an identity copy.
2274 SDValue ScratchRSrcReg
2275 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2276 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2277
2278 // TODO: Don't hardcode these registers and get from the callee function.
2279 SDValue ScratchWaveOffsetReg
2280 = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2281 RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2282
2283 if (!Info->isEntryFunction()) {
2284 // Avoid clobbering this function's FP value. In the current convention
2285 // callee will overwrite this, so do save/restore around the call site.
2286 CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2287 Info->getFrameOffsetReg(), MVT::i32);
2288 }
2289 }
2290
2291 // Stack pointer relative accesses are done by changing the offset SGPR. This
2292 // is just the VGPR offset component.
2293 SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
2294
2295 SmallVector<SDValue, 8> MemOpChains;
2296 MVT PtrVT = MVT::i32;
2297
2298 // Walk the register/memloc assignments, inserting copies/loads.
2299 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2300 ++i, ++realArgIdx) {
2301 CCValAssign &VA = ArgLocs[i];
2302 SDValue Arg = OutVals[realArgIdx];
2303
2304 // Promote the value if needed.
2305 switch (VA.getLocInfo()) {
2306 case CCValAssign::Full:
2307 break;
2308 case CCValAssign::BCvt:
2309 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2310 break;
2311 case CCValAssign::ZExt:
2312 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2313 break;
2314 case CCValAssign::SExt:
2315 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2316 break;
2317 case CCValAssign::AExt:
2318 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2319 break;
2320 case CCValAssign::FPExt:
2321 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2322 break;
2323 default:
2324 llvm_unreachable("Unknown loc info!")::llvm::llvm_unreachable_internal("Unknown loc info!", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2324)
;
2325 }
2326
2327 if (VA.isRegLoc()) {
2328 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2329 } else {
2330 assert(VA.isMemLoc())(static_cast <bool> (VA.isMemLoc()) ? void (0) : __assert_fail
("VA.isMemLoc()", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2330, __extension__ __PRETTY_FUNCTION__))
;
2331
2332 SDValue DstAddr;
2333 MachinePointerInfo DstInfo;
2334
2335 unsigned LocMemOffset = VA.getLocMemOffset();
2336 int32_t Offset = LocMemOffset;
2337
2338 SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
2339
2340 if (IsTailCall) {
2341 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2342 unsigned OpSize = Flags.isByVal() ?
2343 Flags.getByValSize() : VA.getValVT().getStoreSize();
2344
2345 Offset = Offset + FPDiff;
2346 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2347
2348 DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
2349 StackPtr);
2350 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2351
2352 // Make sure any stack arguments overlapping with where we're storing
2353 // are loaded before this eventual operation. Otherwise they'll be
2354 // clobbered.
2355
2356 // FIXME: Why is this really necessary? This seems to just result in a
2357 // lot of code to copy the stack and write them back to the same
2358 // locations, which are supposed to be immutable?
2359 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2360 } else {
2361 DstAddr = PtrOff;
2362 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2363 }
2364
2365 if (Outs[i].Flags.isByVal()) {
2366 SDValue SizeNode =
2367 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2368 SDValue Cpy = DAG.getMemcpy(
2369 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2370 /*isVol = */ false, /*AlwaysInline = */ true,
2371 /*isTailCall = */ false, DstInfo,
2372 MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
2373 *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))));
2374
2375 MemOpChains.push_back(Cpy);
2376 } else {
2377 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
2378 MemOpChains.push_back(Store);
2379 }
2380 }
2381 }
2382
2383 // Copy special input registers after user input arguments.
2384 passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
2385
2386 if (!MemOpChains.empty())
2387 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2388
2389 // Build a sequence of copy-to-reg nodes chained together with token chain
2390 // and flag operands which copy the outgoing args into the appropriate regs.
2391 SDValue InFlag;
2392 for (auto &RegToPass : RegsToPass) {
2393 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2394 RegToPass.second, InFlag);
2395 InFlag = Chain.getValue(1);
2396 }
2397
2398
2399 SDValue PhysReturnAddrReg;
2400 if (IsTailCall) {
2401 // Since the return is being combined with the call, we need to pass on the
2402 // return address.
2403
2404 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2405 SDValue ReturnAddrReg = CreateLiveInRegister(
2406 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2407
2408 PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2409 MVT::i64);
2410 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2411 InFlag = Chain.getValue(1);
2412 }
2413
2414 // We don't usually want to end the call-sequence here because we would tidy
2415 // the frame up *after* the call, however in the ABI-changing tail-call case
2416 // we've carefully laid out the parameters so that when sp is reset they'll be
2417 // in the correct location.
2418 if (IsTailCall && !IsSibCall) {
2419 Chain = DAG.getCALLSEQ_END(Chain,
2420 DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2421 DAG.getTargetConstant(0, DL, MVT::i32),
2422 InFlag, DL);
2423 InFlag = Chain.getValue(1);
2424 }
2425
2426 std::vector<SDValue> Ops;
2427 Ops.push_back(Chain);
2428 Ops.push_back(Callee);
2429
2430 if (IsTailCall) {
2431 // Each tail call may have to adjust the stack by a different amount, so
2432 // this information must travel along with the operation for eventual
2433 // consumption by emitEpilogue.
2434 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2435
2436 Ops.push_back(PhysReturnAddrReg);
2437 }
2438
2439 // Add argument registers to the end of the list so that they are known live
2440 // into the call.
2441 for (auto &RegToPass : RegsToPass) {
2442 Ops.push_back(DAG.getRegister(RegToPass.first,
2443 RegToPass.second.getValueType()));
2444 }
2445
2446 // Add a register mask operand representing the call-preserved registers.
2447
2448 const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
2449 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2450 assert(Mask && "Missing call preserved mask for calling convention")(static_cast <bool> (Mask && "Missing call preserved mask for calling convention"
) ? void (0) : __assert_fail ("Mask && \"Missing call preserved mask for calling convention\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2450, __extension__ __PRETTY_FUNCTION__))
;
2451 Ops.push_back(DAG.getRegisterMask(Mask));
2452
2453 if (InFlag.getNode())
2454 Ops.push_back(InFlag);
2455
2456 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2457
2458 // If we're doing a tail call, use a TC_RETURN here rather than an
2459 // actual call instruction.
2460 if (IsTailCall) {
2461 MFI.setHasTailCall();
2462 return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2463 }
2464
2465 // Returns a chain and a flag for retval copy to use.
2466 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2467 Chain = Call.getValue(0);
2468 InFlag = Call.getValue(1);
2469
2470 if (CallerSavedFP) {
2471 SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2472 Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2473 InFlag = Chain.getValue(1);
2474 }
2475
2476 uint64_t CalleePopBytes = NumBytes;
2477 Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2478 DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2479 InFlag, DL);
2480 if (!Ins.empty())
2481 InFlag = Chain.getValue(1);
2482
2483 // Handle result values, copying them out of physregs into vregs that we
2484 // return.
2485 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2486 InVals, IsThisReturn,
2487 IsThisReturn ? OutVals[0] : SDValue());
2488}
2489
// Translate a textual register name into an AMDGPU register number and check
// that it is usable on the current subtarget with the requested value type.
//
// RegName  Name to look up; only the names listed in the StringSwitch below
//          are recognized.
// VT       Requested value type; its size in bits must match the register
//          (32 or 64, see the switch at the end).
// DAG      Not referenced in this body.
//
// Returns the register number on success. Every failure path goes through
// report_fatal_error.
2490unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2491 SelectionDAG &DAG) const {
2492 unsigned Reg = StringSwitch<unsigned>(RegName)
2493 .Case("m0", AMDGPU::M0)
2494 .Case("exec", AMDGPU::EXEC)
2495 .Case("exec_lo", AMDGPU::EXEC_LO)
2496 .Case("exec_hi", AMDGPU::EXEC_HI)
2497 .Case("flat_scratch", AMDGPU::FLAT_SCR)
2498 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2499 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2500 .Default(AMDGPU::NoRegister);
2501
     // Unknown name: nothing matched in the table above.
2502 if (Reg == AMDGPU::NoRegister) {
2503 report_fatal_error(Twine("invalid register name \""
2504 + StringRef(RegName) + "\"."));
2505
2506 }
2507
     // Reject FLAT_SCR and anything overlapping it when the subtarget
     // generation is SOUTHERN_ISLANDS.
2508 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
2509 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2510 report_fatal_error(Twine("invalid register \""
2511 + StringRef(RegName) + "\" for subtarget."));
2512 }
2513
     // Type check: the first group of registers requires a 32-bit VT, the
     // second group a 64-bit VT. A size mismatch breaks out of the switch and
     // reaches the "invalid type" error at the bottom.
2514 switch (Reg) {
2515 case AMDGPU::M0:
2516 case AMDGPU::EXEC_LO:
2517 case AMDGPU::EXEC_HI:
2518 case AMDGPU::FLAT_SCR_LO:
2519 case AMDGPU::FLAT_SCR_HI:
2520 if (VT.getSizeInBits() == 32)
2521 return Reg;
2522 break;
2523 case AMDGPU::EXEC:
2524 case AMDGPU::FLAT_SCR:
2525 if (VT.getSizeInBits() == 64)
2526 return Reg;
2527 break;
2528 default:
     // Every register the StringSwitch can produce must have a case above.
2529 llvm_unreachable("missing register type checking")::llvm::llvm_unreachable_internal("missing register type checking"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2529)
;
2530 }
2531
2532 report_fatal_error(Twine("invalid type for register \""
2533 + StringRef(RegName) + "\"."));
2534}
2535
2536// If kill is not the last instruction, split the block so kill is always a
2537// proper terminator.
//
// MI  The kill pseudo instruction; its descriptor is replaced in place with
//     the one returned by getKillTerminatorFromPseudo for its opcode.
// BB  The block containing MI.
//
// Returns BB unchanged when MI is already the last instruction, otherwise the
// newly created block that receives every instruction following MI.
2538MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2539 MachineBasicBlock *BB) const {
2540 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2541
     // SplitPoint is the instruction immediately after MI.
2542 MachineBasicBlock::iterator SplitPoint(&MI);
2543 ++SplitPoint;
2544
2545 if (SplitPoint == BB->end()) {
2546 // Don't bother with a new block.
2547 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2548 return BB;
2549 }
2550
2551 MachineFunction *MF = BB->getParent();
2552 MachineBasicBlock *SplitBB
2553 = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2554
     // Place the new block directly after BB in the function and move the
     // tail [SplitPoint, end) of BB into it.
2555 MF->insert(++MachineFunction::iterator(BB), SplitBB);
2556 SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2557
     // SplitBB takes over BB's former successors; BB now has SplitBB as its
     // only added successor.
2558 SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2559 BB->addSuccessor(SplitBB);
2560
2561 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2562 return SplitBB;
2563}
2564
2565// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2566// wavefront. If the value is uniform and just happens to be in a VGPR, this
2567// will only do one iteration. In the worst case, this will loop 64 times.
2568//
2569// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
//
// Emits the loop body into LoopBB (starting at LoopBB.begin()):
//   OrigBB           Block named as the non-loop incoming edge of both PHIs.
//   LoopBB           Block that receives the instructions and is also the
//                    branch target of the back edge.
//   IdxReg           Operand holding the index.
//   InitReg          Incoming PHI value for PhiReg from OrigBB.
//   ResultReg        Incoming PHI value for PhiReg from LoopBB.
//   PhiReg           Register defined by the first PHI.
//   InitSaveExecReg  Incoming value from OrigBB for the exec-mask PHI.
//   Offset           Constant added to the index with S_ADD_I32 when nonzero.
//   UseGPRIdxMode    Emit S_SET_GPR_IDX_ON instead of writing M0.
//   IsIndirectSrc    Selects SRC0_ENABLE (true) or DST_ENABLE (false) as the
//                    index mode immediate.
//
// Returns an iterator to the S_XOR_B64 that updates EXEC; the callers in this
// file insert the indexed move in front of it.
2570static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2571 const SIInstrInfo *TII,
2572 MachineRegisterInfo &MRI,
2573 MachineBasicBlock &OrigBB,
2574 MachineBasicBlock &LoopBB,
2575 const DebugLoc &DL,
2576 const MachineOperand &IdxReg,
2577 unsigned InitReg,
2578 unsigned ResultReg,
2579 unsigned PhiReg,
2580 unsigned InitSaveExecReg,
2581 int Offset,
2582 bool UseGPRIdxMode,
2583 bool IsIndirectSrc) {
2584 MachineBasicBlock::iterator I = LoopBB.begin();
2585
2586 unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2587 unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2588 unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2589 unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2590
     // PhiReg = phi [InitReg, OrigBB], [ResultReg, LoopBB]
2591 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2592 .addReg(InitReg)
2593 .addMBB(&OrigBB)
2594 .addReg(ResultReg)
2595 .addMBB(&LoopBB);
2596
     // PhiExec = phi [InitSaveExecReg, OrigBB], [NewExec, LoopBB]
2597 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2598 .addReg(InitSaveExecReg)
2599 .addMBB(&OrigBB)
2600 .addReg(NewExec)
2601 .addMBB(&LoopBB);
2602
2603 // Read the next variant <- also loop target.
2604 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2605 .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2606
2607 // Compare the just read M0 value to all possible Idx values.
     // NOTE(review): this use passes IdxReg.getSubReg() but no undef state,
     // while the V_READFIRSTLANE_B32 above passes the undef state but no
     // sub-register; confirm the asymmetry is intended.
2608 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2609 .addReg(CurrentIdxReg)
2610 .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2611
2612 // Update EXEC, save the original EXEC value to VCC.
     // NOTE(review): the destination here is the virtual register NewExec,
     // not VCC as the comment above says.
2613 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2614 .addReg(CondReg, RegState::Kill);
2615
2616 MRI.setSimpleHint(NewExec, CondReg);
2617
2618 if (UseGPRIdxMode) {
     // NOTE(review): this local shadows the MachineOperand parameter IdxReg
     // for the rest of this branch.
2619 unsigned IdxReg;
2620 if (Offset == 0) {
2621 IdxReg = CurrentIdxReg;
2622 } else {
2623 IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2624 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2625 .addReg(CurrentIdxReg, RegState::Kill)
2626 .addImm(Offset);
2627 }
2628 unsigned IdxMode = IsIndirectSrc ?
2629 VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
2630 MachineInstr *SetOn =
2631 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2632 .addReg(IdxReg, RegState::Kill)
2633 .addImm(IdxMode);
     // Operand 3 is beyond the two explicit operands added above; presumably
     // an implicit operand from the instruction description - TODO confirm.
2634 SetOn->getOperand(3).setIsUndef();
2635 } else {
2636 // Move index from VCC into M0
     // NOTE(review): the source register here is CurrentIdxReg (created
     // above in SGPR_32RegClass), not VCC.
2637 if (Offset == 0) {
2638 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2639 .addReg(CurrentIdxReg, RegState::Kill);
2640 } else {
2641 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2642 .addReg(CurrentIdxReg, RegState::Kill)
2643 .addImm(Offset);
2644 }
2645 }
2646
2647 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2648 MachineInstr *InsertPt =
2649 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
2650 .addReg(AMDGPU::EXEC)
2651 .addReg(NewExec);
2652
2653 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2654 // s_cbranch_scc0?
2655
2656 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2657 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2658 .addMBB(&LoopBB);
2659
2660 return InsertPt->getIterator();
2661}
2662
2663// This has slightly sub-optimal regalloc when the source vector is killed by
2664// the read. The register allocator does not understand that the kill is
2665// per-workitem, so is kept alive for the whole loop so we end up not re-using a
2666// subregister from it, using 1 more VGPR than necessary. This was saved when
2667// this was expanded after register allocation.
//
// Splits MBB at MI into MBB -> LoopBB -> RemainderBB, fills LoopBB through
// emitLoadM0FromVGPRLoop, and restores EXEC at the top of RemainderBB.
//
//   MBB            Block containing MI; keeps everything before MI.
//   MI             Instruction carrying the named "idx" operand; it is moved
//                  into RemainderBB together with everything after it.
//   InitResultReg  Forwarded as the loop's InitReg.
//   PhiReg         Forwarded as the loop's PhiReg.
//   Offset, UseGPRIdxMode, IsIndirectSrc  Forwarded unchanged.
//
// Returns the insertion point inside LoopBB reported by
// emitLoadM0FromVGPRLoop.
2668static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
2669 MachineBasicBlock &MBB,
2670 MachineInstr &MI,
2671 unsigned InitResultReg,
2672 unsigned PhiReg,
2673 int Offset,
2674 bool UseGPRIdxMode,
2675 bool IsIndirectSrc) {
2676 MachineFunction *MF = MBB.getParent();
2677 MachineRegisterInfo &MRI = MF->getRegInfo();
2678 const DebugLoc &DL = MI.getDebugLoc();
2679 MachineBasicBlock::iterator I(&MI);
2680
     // MI's operand 0 register is passed to the loop as ResultReg.
2681 unsigned DstReg = MI.getOperand(0).getReg();
2682 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2683 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2684
     // TmpExec is only an IMPLICIT_DEF; it is passed to the loop as
     // InitSaveExecReg.
2685 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2686
2687 // Save the EXEC mask
2688 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2689 .addReg(AMDGPU::EXEC);
2690
2691 // To insert the loop we need to split the block. Move everything after this
2692 // point to a new block, and insert a new empty block between the two.
2693 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
2694 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2695 MachineFunction::iterator MBBI(MBB);
2696 ++MBBI;
2697
     // Both inserts use the same position, so the layout order becomes
     // MBB, LoopBB, RemainderBB.
2698 MF->insert(MBBI, LoopBB);
2699 MF->insert(MBBI, RemainderBB);
2700
     // LoopBB branches either back to itself or on to RemainderBB.
2701 LoopBB->addSuccessor(LoopBB);
2702 LoopBB->addSuccessor(RemainderBB);
2703
2704 // Move the rest of the block into a new block.
     // The spliced range starts at I, i.e. it includes MI itself.
2705 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
2706 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2707
2708 MBB.addSuccessor(LoopBB);
2709
2710 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2711
2712 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2713 InitResultReg, DstReg, PhiReg, TmpExec,
2714 Offset, UseGPRIdxMode, IsIndirectSrc);
2715
     // Restore the EXEC mask saved above as the first instruction of
     // RemainderBB.
2716 MachineBasicBlock::iterator First = RemainderBB->begin();
2717 BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
2718 .addReg(SaveExec);
2719
2720 return InsPt;
2721}
2722
2723// Returns subreg index, offset
//
// Folds a constant element offset into a sub-register index when it is in
// range for the vector register class.
//
//   TRI      Used to query the size in bits of SuperRC.
//   SuperRC  Register class of the vector; element count is size / 32.
//   VecReg   Not referenced in this body.
//   Offset   Constant element offset.
//
// In range (0 <= Offset < element count): returns {sub0 + Offset, 0}.
// Out of range: returns {sub0, Offset}, leaving the offset untouched.
2724static std::pair<unsigned, int>
2725computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
2726 const TargetRegisterClass *SuperRC,
2727 unsigned VecReg,
2728 int Offset) {
2729 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
2730
2731 // Skip out of bounds offsets, or else we would end up using an undefined
2732 // register.
2733 if (Offset >= NumElts || Offset < 0)
2734 return std::make_pair(AMDGPU::sub0, Offset);
2735
2736 return std::make_pair(AMDGPU::sub0 + Offset, 0);
2737}
2738
2739// Return true if the index is an SGPR and was set.
//
// Looks up MI's named "idx" operand. If its register class is not an SGPR
// class, nothing is emitted and false is returned. Otherwise instructions are
// inserted immediately before MI:
//   UseGPRIdxMode == true   S_SET_GPR_IDX_ON on the index (preceded by an
//                           S_ADD_I32 into a temporary when Offset != 0).
//   UseGPRIdxMode == false  S_MOV_B32 (Offset == 0) or S_ADD_I32 into M0.
// IsIndirectSrc selects SRC0_ENABLE (true) or DST_ENABLE (false) as the
// index-mode immediate.
2740static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
2741 MachineRegisterInfo &MRI,
2742 MachineInstr &MI,
2743 int Offset,
2744 bool UseGPRIdxMode,
2745 bool IsIndirectSrc) {
2746 MachineBasicBlock *MBB = MI.getParent();
2747 const DebugLoc &DL = MI.getDebugLoc();
2748 MachineBasicBlock::iterator I(&MI);
2749
2750 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2751 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
2752
     // NOTE(review): this assert runs after Idx->getReg() has already been
     // handed to getRegClass above, so it cannot guard that call.
2753 assert(Idx->getReg() != AMDGPU::NoRegister)(static_cast <bool> (Idx->getReg() != AMDGPU::NoRegister
) ? void (0) : __assert_fail ("Idx->getReg() != AMDGPU::NoRegister"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2753, __extension__ __PRETTY_FUNCTION__))
;
2754
     // Non-SGPR index: the caller has to handle it (see return value).
2755 if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
2756 return false;
2757
2758 if (UseGPRIdxMode) {
2759 unsigned IdxMode = IsIndirectSrc ?
2760 VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
2761 if (Offset == 0) {
2762 MachineInstr *SetOn =
2763 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2764 .add(*Idx)
2765 .addImm(IdxMode);
2766
     // Operand 3 is beyond the two explicit operands added above; presumably
     // an implicit operand from the instruction description - TODO confirm.
2767 SetOn->getOperand(3).setIsUndef();
2768 } else {
     // Fold the constant offset into a temporary before enabling index mode.
2769 unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2770 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
2771 .add(*Idx)
2772 .addImm(Offset);
2773 MachineInstr *SetOn =
2774 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2775 .addReg(Tmp, RegState::Kill)
2776 .addImm(IdxMode);
2777
2778 SetOn->getOperand(3).setIsUndef();
2779 }
2780
2781 return true;
2782 }
2783
     // M0-based path: write the index (plus Offset, if any) directly to M0.
2784 if (Offset == 0) {
2785 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2786 .add(*Idx);
2787 } else {
2788 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2789 .add(*Idx)
2790 .addImm(Offset);
2791 }
2792
2793 return true;
2794}
2795
2796// Control flow needs to be inserted if indexing with a VGPR.
//
// Expands an indirect-source pseudo MI (operand 0 = destination, named
// operands "src", "offset" and "idx") into an indexed read of the source
// vector.
//
//   MI   Pseudo instruction to expand; erased before returning.
//   MBB  Block containing MI.
//   ST   Subtarget; supplies the instruction info and decides whether VGPR
//        index mode is used.
//
// Returns &MBB when the index was handled by setM0ToIndexFromSGPR (no new
// blocks), otherwise the loop block created by loadM0FromVGPR.
2797static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
2798 MachineBasicBlock &MBB,
2799 const SISubtarget &ST) {
2800 const SIInstrInfo *TII = ST.getInstrInfo();
2801 const SIRegisterInfo &TRI = TII->getRegisterInfo();
2802 MachineFunction *MF = MBB.getParent();
2803 MachineRegisterInfo &MRI = MF->getRegInfo();
2804
2805 unsigned Dst = MI.getOperand(0).getReg();
2806 unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
2807 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
2808
2809 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
2810
     // Fold an in-range constant offset into the sub-register index; Offset
     // is overwritten with whatever remains.
2811 unsigned SubReg;
2812 std::tie(SubReg, Offset)
2813 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
2814
2815 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
2816
     // Straight-line path: the index setup was emitted in front of MI.
2817 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
2818 MachineBasicBlock::iterator I(&MI);
2819 const DebugLoc &DL = MI.getDebugLoc();
2820
2821 if (UseGPRIdxMode) {
2822 // TODO: Look at the uses to avoid the copy. This may require rescheduling
2823 // to avoid interfering with other uses, so probably requires a new
2824 // optimization pass.
2825 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
2826 .addReg(SrcReg, RegState::Undef, SubReg)
2827 .addReg(SrcReg, RegState::Implicit)
2828 .addReg(AMDGPU::M0, RegState::Implicit);
2829 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2830 } else {
2831 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
2832 .addReg(SrcReg, RegState::Undef, SubReg)
2833 .addReg(SrcReg, RegState::Implicit);
2834 }
2835
2836 MI.eraseFromParent();
2837
2838 return &MBB;
2839 }
2840
     // Loop path: the index was not set up above, so build the loop.
2841 const DebugLoc &DL = MI.getDebugLoc();
2842 MachineBasicBlock::iterator I(&MI);
2843
2844 unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2845 unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2846
     // InitReg is only an IMPLICIT_DEF; it is the initial result value
     // passed to loadM0FromVGPR.
2847 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
2848
2849 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
2850 Offset, UseGPRIdxMode, true);
2851 MachineBasicBlock *LoopBB = InsPt->getParent();
2852
     // Same two move forms as the straight-line path, inserted at InsPt
     // inside the loop block.
2853 if (UseGPRIdxMode) {
2854 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
2855 .addReg(SrcReg, RegState::Undef, SubReg)
2856 .addReg(SrcReg, RegState::Implicit)
2857 .addReg(AMDGPU::M0, RegState::Implicit);
2858 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2859 } else {
2860 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
2861 .addReg(SrcReg, RegState::Undef, SubReg)
2862 .addReg(SrcReg, RegState::Implicit);
2863 }
2864
2865 MI.eraseFromParent();
2866
2867 return LoopBB;
2868}
2869
// Pick the V_MOVRELD_B32_V{1,2,4,8,16} pseudo opcode matching the size in
// bits of the vector register class VecRC (32, 64, 128, 256 or 512). Any
// other size hits llvm_unreachable.
2870static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
2871 const TargetRegisterClass *VecRC) {
2872 switch (TRI.getRegSizeInBits(*VecRC)) {
2873 case 32: // 4 bytes
2874 return AMDGPU::V_MOVRELD_B32_V1;
2875 case 64: // 8 bytes
2876 return AMDGPU::V_MOVRELD_B32_V2;
2877 case 128: // 16 bytes
2878 return AMDGPU::V_MOVRELD_B32_V4;
2879 case 256: // 32 bytes
2880 return AMDGPU::V_MOVRELD_B32_V8;
2881 case 512: // 64 bytes
2882 return AMDGPU::V_MOVRELD_B32_V16;
2883 default:
2884 llvm_unreachable("unsupported size for MOVRELD pseudos")::llvm::llvm_unreachable_internal("unsupported size for MOVRELD pseudos"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2884)
;
2885 }
2886}
2887
// Expands an indirect-destination pseudo MI (operand 0 = destination, named
// operands "src", "idx", "val" and "offset") into an indexed write of Val
// into the source vector.
//
//   MI   Pseudo instruction to expand; erased before returning.
//   MBB  Block containing MI.
//   ST   Subtarget; supplies the instruction info and decides whether VGPR
//        index mode is used.
//
// Three paths, in order:
//   1. Idx has no register: a plain INSERT_SUBREG, returns &MBB.
//   2. setM0ToIndexFromSGPR handled the index: straight-line indexed move,
//      returns &MBB.
//   3. Otherwise: loop built by loadM0FromVGPR, returns the loop block.
2888static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
2889 MachineBasicBlock &MBB,
2890 const SISubtarget &ST) {
2891 const SIInstrInfo *TII = ST.getInstrInfo();
2892 const SIRegisterInfo &TRI = TII->getRegisterInfo();
2893 MachineFunction *MF = MBB.getParent();
2894 MachineRegisterInfo &MRI = MF->getRegInfo();
2895
2896 unsigned Dst = MI.getOperand(0).getReg();
2897 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
2898 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2899 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
2900 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
2901 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
2902
2903 // This can be an immediate, but will be folded later.
     // NOTE(review): the assert below calls Val->getReg() unconditionally,
     // which seems at odds with the comment above and with the later
     // Val->isReg() check - confirm which is intended.
2904 assert(Val->getReg())(static_cast <bool> (Val->getReg()) ? void (0) : __assert_fail
("Val->getReg()", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2904, __extension__ __PRETTY_FUNCTION__))
;
2905
     // Fold an in-range constant offset into the sub-register index; Offset
     // is overwritten with whatever remains.
2906 unsigned SubReg;
2907 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
2908 SrcVec->getReg(),
2909 Offset);
2910 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
2911
     // Path 1: no index register at all, so the element is selected purely by
     // SubReg and a plain INSERT_SUBREG suffices.
2912 if (Idx->getReg() == AMDGPU::NoRegister) {
2913 MachineBasicBlock::iterator I(&MI);
2914 const DebugLoc &DL = MI.getDebugLoc();
2915
2916 assert(Offset == 0)(static_cast <bool> (Offset == 0) ? void (0) : __assert_fail
("Offset == 0", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2916, __extension__ __PRETTY_FUNCTION__))
;
2917
2918 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
2919 .add(*SrcVec)
2920 .add(*Val)
2921 .addImm(SubReg);
2922
2923 MI.eraseFromParent();
2924 return &MBB;
2925 }
2926
     // Path 2: the index setup was emitted in front of MI.
2927 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
2928 MachineBasicBlock::iterator I(&MI);
2929 const DebugLoc &DL = MI.getDebugLoc();
2930
2931 if (UseGPRIdxMode) {
2932 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
2933 .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
2934 .add(*Val)
2935 .addReg(Dst, RegState::ImplicitDefine)
2936 .addReg(SrcVec->getReg(), RegState::Implicit)
2937 .addReg(AMDGPU::M0, RegState::Implicit);
2938
2939 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2940 } else {
2941 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
2942
     // The pseudo takes the element number as an immediate, hence the
     // subtraction of sub0 from the sub-register index.
2943 BuildMI(MBB, I, DL, MovRelDesc)
2944 .addReg(Dst, RegState::Define)
2945 .addReg(SrcVec->getReg())
2946 .add(*Val)
2947 .addImm(SubReg - AMDGPU::sub0);
2948 }
2949
2950 MI.eraseFromParent();
2951 return &MBB;
2952 }
2953
     // Path 3: loop. Kill flags on Val's register are cleared before Val is
     // used inside the loop block.
2954 if (Val->isReg())
2955 MRI.clearKillFlags(Val->getReg());
2956
2957 const DebugLoc &DL = MI.getDebugLoc();
2958
2959 unsigned PhiReg = MRI.createVirtualRegister(VecRC);
2960
     // The source vector register is the initial value feeding PhiReg.
2961 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
2962 Offset, UseGPRIdxMode, false);
2963 MachineBasicBlock *LoopBB = InsPt->getParent();
2964
     // Same two move forms as path 2, but reading PhiReg instead of the
     // source vector and inserted at InsPt inside the loop block.
2965 if (UseGPRIdxMode) {
2966 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
2967 .addReg(PhiReg, RegState::Undef, SubReg) // vdst
2968 .add(*Val) // src0
2969 .addReg(Dst, RegState::ImplicitDefine)
2970 .addReg(PhiReg, RegState::Implicit)
2971 .addReg(AMDGPU::M0, RegState::Implicit);
2972 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2973 } else {
2974 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
2975
2976 BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
2977 .addReg(Dst, RegState::Define)
2978 .addReg(PhiReg)
2979 .add(*Val)
2980 .addImm(SubReg - AMDGPU::sub0);
2981 }
2982
2983 MI.eraseFromParent();
2984
2985 return LoopBB;
2986}
2987
2988MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
2989 MachineInstr &MI, MachineBasicBlock *BB) const {
2990
2991 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2992 MachineFunction *MF = BB->getParent();
2993 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2994
2995 if (TII->isMIMG(MI)) {
2996 if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
2997 report_fatal_error("missing mem operand from MIMG instruction");
2998 }
2999 // Add a memoperand for mimg instructions so that they aren't assumed to
3000 // be ordered memory instructions.
3001
3002 return BB;
3003 }
3004
3005 switch (MI.getOpcode()) {
3006 case AMDGPU::S_ADD_U64_PSEUDO:
3007 case AMDGPU::S_SUB_U64_PSEUDO: {
3008 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3009 const DebugLoc &DL = MI.getDebugLoc();
3010
3011 MachineOperand &Dest = MI.getOperand(0);
3012 MachineOperand &Src0 = MI.getOperand(1);
3013 MachineOperand &Src1 = MI.getOperand(2);
3014
3015 unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3016 unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3017
3018 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3019 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3020 &AMDGPU::SReg_32_XM0RegClass);
3021 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3022 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3023 &AMDGPU::SReg_32_XM0RegClass);
3024
3025 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3026 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3027 &AMDGPU::SReg_32_XM0RegClass);
3028 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3029 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3030 &AMDGPU::SReg_32_XM0RegClass);
3031
3032 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3033
3034 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3035 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3036 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3037 .add(Src0Sub0)
3038 .add(Src1Sub0);
3039 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3040 .add(Src0Sub1)
3041 .add(Src1Sub1);
3042 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3043 .addReg(DestSub0)
3044 .addImm(AMDGPU::sub0)
3045 .addReg(DestSub1)
3046 .addImm(AMDGPU::sub1);
3047 MI.eraseFromParent();
3048 return BB;
3049 }
3050 case AMDGPU::SI_INIT_M0: {
3051 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3052 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3053 .add(MI.getOperand(0));
3054 MI.eraseFromParent();
3055 return BB;
3056 }
3057 case AMDGPU::SI_INIT_EXEC:
3058 // This should be before all vector instructions.
3059 BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3060 AMDGPU::EXEC)
3061 .addImm(MI.getOperand(0).getImm());
3062 MI.eraseFromParent();
3063 return BB;
3064
3065 case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3066 // Extract the thread count from an SGPR input and set EXEC accordingly.
3067 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3068 //
3069 // S_BFE_U32 count, input, {shift, 7}
3070 // S_BFM_B64 exec, count, 0
3071 // S_CMP_EQ_U32 count, 64
3072 // S_CMOV_B64 exec, -1
3073 MachineInstr *FirstMI = &*BB->begin();
3074 MachineRegisterInfo &MRI = MF->getRegInfo();
3075 unsigned InputReg = MI.getOperand(0).getReg();
3076 unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3077 bool Found = false;
3078
3079 // Move the COPY of the input reg to the beginning, so that we can use it.
3080 for (auto I = BB->begin(); I != &MI; I++) {
3081 if (I->getOpcode() != TargetOpcode::COPY ||
3082 I->getOperand(0).getReg() != InputReg)
3083 continue;
3084
3085 if (I == FirstMI) {
3086 FirstMI = &*++BB->begin();
3087 } else {
3088 I->removeFromParent();
3089 BB->insert(FirstMI, &*I);
3090 }
3091 Found = true;
3092 break;
3093 }
3094 assert(Found)(static_cast <bool> (Found) ? void (0) : __assert_fail (
"Found", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3094, __extension__ __PRETTY_FUNCTION__))
;
3095 (void)Found;
3096
3097 // This should be before all vector instructions.
3098 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3099 .addReg(InputReg)
3100 .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3101 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3102 AMDGPU::EXEC)
3103 .addReg(CountReg)
3104 .addImm(0);
3105 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3106 .addReg(CountReg, RegState::Kill)
3107 .addImm(64);
3108 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3109 AMDGPU::EXEC)
3110 .addImm(-1);
3111 MI.eraseFromParent();
3112 return BB;
3113 }
3114
3115 case AMDGPU::GET_GROUPSTATICSIZE: {
3116 DebugLoc DL = MI.getDebugLoc();
3117 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3118 .add(MI.getOperand(0))
3119 .addImm(MFI->getLDSSize());
3120 MI.eraseFromParent();
3121 return BB;
3122 }
3123 case AMDGPU::SI_INDIRECT_SRC_V1:
3124 case AMDGPU::SI_INDIRECT_SRC_V2:
3125 case AMDGPU::SI_INDIRECT_SRC_V4:
3126 case AMDGPU::SI_INDIRECT_SRC_V8:
3127 case AMDGPU::SI_INDIRECT_SRC_V16:
3128 return emitIndirectSrc(MI, *BB, *getSubtarget());
3129 case AMDGPU::SI_INDIRECT_DST_V1:
3130 case AMDGPU::SI_INDIRECT_DST_V2:
3131 case AMDGPU::SI_INDIRECT_DST_V4:
3132 case AMDGPU::SI_INDIRECT_DST_V8:
3133 case AMDGPU::SI_INDIRECT_DST_V16:
3134 return emitIndirectDst(MI, *BB, *getSubtarget());
3135 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3136 case AMDGPU::SI_KILL_I1_PSEUDO:
3137 return splitKillBlock(MI, BB);
3138 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3139 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3140
3141 unsigned Dst = MI.getOperand(0).getReg();
3142 unsigned Src0 = MI.getOperand(1).getReg();
3143 unsigned Src1 = MI.getOperand(2).getReg();
3144 const DebugLoc &DL = MI.getDebugLoc();
3145 unsigned SrcCond = MI.getOperand(3).getReg();
3146
3147 unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3148 unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3149 unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3150
3151 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3152 .addReg(SrcCond);
3153 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3154 .addReg(Src0, 0, AMDGPU::sub0)
3155 .addReg(Src1, 0, AMDGPU::sub0)
3156 .addReg(SrcCondCopy);
3157 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3158 .addReg(Src0, 0, AMDGPU::sub1)
3159 .addReg(Src1, 0, AMDGPU::sub1)
3160 .addReg(SrcCondCopy);
3161
3162 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3163 .addReg(DstLo)
3164 .addImm(AMDGPU::sub0)
3165 .addReg(DstHi)
3166 .addImm(AMDGPU::sub1);
3167 MI.eraseFromParent();
3168 return BB;
3169 }
3170 case AMDGPU::SI_BR_UNDEF: {
3171 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3172 const DebugLoc &DL = MI.getDebugLoc();
3173 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3174 .add(MI.getOperand(0));
3175 Br->getOperand(1).setIsUndef(true); // read undef SCC
3176 MI.eraseFromParent();
3177 return BB;
3178 }
3179 case AMDGPU::ADJCALLSTACKUP:
3180 case AMDGPU::ADJCALLSTACKDOWN: {
3181 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3182 MachineInstrBuilder MIB(*MF, &MI);
3183
3184 // Add an implicit use of the frame offset reg to prevent the restore copy
3185 // inserted after the call from being reordered after stack operations in
3186 // the caller's frame.
3187 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
3188 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3189 .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
3190 return BB;
3191 }
3192 case AMDGPU::SI_CALL_ISEL:
3193 case AMDGPU::SI_TCRETURN_ISEL: {
3194 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3195 const DebugLoc &DL = MI.getDebugLoc();
3196 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3197
3198 MachineRegisterInfo &MRI = MF->getRegInfo();
3199 unsigned GlobalAddrReg = MI.getOperand(0).getReg();
3200 MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
3201 assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET)(static_cast <bool> (PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET
) ? void (0) : __assert_fail ("PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3201, __extension__ __PRETTY_FUNCTION__))
;
3202
3203 const GlobalValue *G = PCRel->getOperand(1).getGlobal();
3204
3205 MachineInstrBuilder MIB;
3206 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
3207 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
3208 .add(MI.getOperand(0))
3209 .addGlobalAddress(G);
3210 } else {
3211 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
3212 .add(MI.getOperand(0))
3213 .addGlobalAddress(G);
3214
3215 // There is an additional imm operand for tcreturn, but it should be in the
3216 // right place already.
3217 }
3218
3219 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3220 MIB.add(MI.getOperand(I));
3221
3222 MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
3223 MI.eraseFromParent();
3224 return BB;
3225 }
3226 default:
3227 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
3228 }
3229}
3230
3231bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3232 return isTypeLegal(VT.getScalarType());
3233}
3234
3235bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3236 // This currently forces unfolding various combinations of fsub into fma with
3237 // free fneg'd operands. As long as we have fast FMA (controlled by
3238 // isFMAFasterThanFMulAndFAdd), we should perform these.
3239
3240 // When fma is quarter rate, for f64 where add / sub are at best half rate,
3241 // most of these combines appear to be cycle neutral but save on instruction
3242 // count / code size.
3243 return true;
3244}
3245
3246EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3247 EVT VT) const {
3248 if (!VT.isVector()) {
3249 return MVT::i1;
3250 }
3251 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3252}
3253
3254MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3255 // TODO: Should i16 be used always if legal? For now it would force VALU
3256 // shifts.
3257 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3258}
3259
3260// Answering this is somewhat tricky and depends on the specific device which
3261// have different rates for fma or all f64 operations.
3262//
3263// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3264// regardless of which device (although the number of cycles differs between
3265// devices), so it is always profitable for f64.
3266//
3267// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3268// only on full rate devices. Normally, we should prefer selecting v_mad_f32
3269// which we can always do even without fused FP ops since it returns the same
3270// result as the separate operations and since it is always full
3271// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3272// however does not support denormals, so we do report fma as faster if we have
3273// a fast fma device and require denormals.
3274//
3275bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3276 VT = VT.getScalarType();
3277
3278 switch (VT.getSimpleVT().SimpleTy) {
3279 case MVT::f32:
3280 // This is as fast on some subtargets. However, we always have full rate f32
3281 // mad available which returns the same result as the separate operations
3282 // which we should prefer over fma. We can't use this if we want to support
3283 // denormals, so only report this in these cases.
3284 return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
3285 case MVT::f64:
3286 return true;
3287 case MVT::f16:
3288 return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3289 default:
3290 break;
3291 }
3292
3293 return false;
3294}
3295
3296//===----------------------------------------------------------------------===//
3297// Custom DAG Lowering Operations
3298//===----------------------------------------------------------------------===//
3299
3300SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3301 switch (Op.getOpcode()) {
3302 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3303 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3304 case ISD::LOAD: {
3305 SDValue Result = LowerLOAD(Op, DAG);
3306 assert((!Result.getNode() ||(static_cast <bool> ((!Result.getNode() || Result.getNode
()->getNumValues() == 2) && "Load should return a value and a chain"
) ? void (0) : __assert_fail ("(!Result.getNode() || Result.getNode()->getNumValues() == 2) && \"Load should return a value and a chain\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3308, __extension__ __PRETTY_FUNCTION__))
3307 Result.getNode()->getNumValues() == 2) &&(static_cast <bool> ((!Result.getNode() || Result.getNode
()->getNumValues() == 2) && "Load should return a value and a chain"
) ? void (0) : __assert_fail ("(!Result.getNode() || Result.getNode()->getNumValues() == 2) && \"Load should return a value and a chain\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3308, __extension__ __PRETTY_FUNCTION__))
3308 "Load should return a value and a chain")(static_cast <bool> ((!Result.getNode() || Result.getNode
()->getNumValues() == 2) && "Load should return a value and a chain"
) ? void (0) : __assert_fail ("(!Result.getNode() || Result.getNode()->getNumValues() == 2) && \"Load should return a value and a chain\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3308, __extension__ __PRETTY_FUNCTION__))
;
3309 return Result;
3310 }
3311
3312 case ISD::FSIN:
3313 case ISD::FCOS:
3314 return LowerTrig(Op, DAG);
3315 case ISD::SELECT: return LowerSELECT(Op, DAG);
3316 case ISD::FDIV: return LowerFDIV(Op, DAG);
3317 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3318 case ISD::STORE: return LowerSTORE(Op, DAG);
3319 case ISD::GlobalAddress: {
3320 MachineFunction &MF = DAG.getMachineFunction();
3321 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3322 return LowerGlobalAddress(MFI, Op, DAG);
3323 }
3324 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3325 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3326 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3327 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3328 case ISD::INSERT_VECTOR_ELT:
3329 return lowerINSERT_VECTOR_ELT(Op, DAG);
3330 case ISD::EXTRACT_VECTOR_ELT:
3331 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3332 case ISD::FP_ROUND:
3333 return lowerFP_ROUND(Op, DAG);
3334 case ISD::TRAP:
3335 case ISD::DEBUGTRAP:
3336 return lowerTRAP(Op, DAG);
3337 }
3338 return SDValue();
3339}
3340
3341static unsigned getImageOpcode(unsigned IID) {
3342 switch (IID) {
3343 case Intrinsic::amdgcn_image_load:
3344 return AMDGPUISD::IMAGE_LOAD;
3345 case Intrinsic::amdgcn_image_load_mip:
3346 return AMDGPUISD::IMAGE_LOAD_MIP;
3347
3348 // Basic sample.
3349 case Intrinsic::amdgcn_image_sample:
3350 return AMDGPUISD::IMAGE_SAMPLE;
3351 case Intrinsic::amdgcn_image_sample_cl:
3352 return AMDGPUISD::IMAGE_SAMPLE_CL;
3353 case Intrinsic::amdgcn_image_sample_d:
3354 return AMDGPUISD::IMAGE_SAMPLE_D;
3355 case Intrinsic::amdgcn_image_sample_d_cl:
3356 return AMDGPUISD::IMAGE_SAMPLE_D_CL;
3357 case Intrinsic::amdgcn_image_sample_l:
3358 return AMDGPUISD::IMAGE_SAMPLE_L;
3359 case Intrinsic::amdgcn_image_sample_b:
3360 return AMDGPUISD::IMAGE_SAMPLE_B;
3361 case Intrinsic::amdgcn_image_sample_b_cl:
3362 return AMDGPUISD::IMAGE_SAMPLE_B_CL;
3363 case Intrinsic::amdgcn_image_sample_lz:
3364 return AMDGPUISD::IMAGE_SAMPLE_LZ;
3365 case Intrinsic::amdgcn_image_sample_cd:
3366 return AMDGPUISD::IMAGE_SAMPLE_CD;
3367 case Intrinsic::amdgcn_image_sample_cd_cl:
3368 return AMDGPUISD::IMAGE_SAMPLE_CD_CL;
3369
3370 // Sample with comparison.
3371 case Intrinsic::amdgcn_image_sample_c:
3372 return AMDGPUISD::IMAGE_SAMPLE_C;
3373 case Intrinsic::amdgcn_image_sample_c_cl:
3374 return AMDGPUISD::IMAGE_SAMPLE_C_CL;
3375 case Intrinsic::amdgcn_image_sample_c_d:
3376 return AMDGPUISD::IMAGE_SAMPLE_C_D;
3377 case Intrinsic::amdgcn_image_sample_c_d_cl:
3378 return AMDGPUISD::IMAGE_SAMPLE_C_D_CL;
3379 case Intrinsic::amdgcn_image_sample_c_l:
3380 return AMDGPUISD::IMAGE_SAMPLE_C_L;
3381 case Intrinsic::amdgcn_image_sample_c_b:
3382 return AMDGPUISD::IMAGE_SAMPLE_C_B;
3383 case Intrinsic::amdgcn_image_sample_c_b_cl:
3384 return AMDGPUISD::IMAGE_SAMPLE_C_B_CL;
3385 case Intrinsic::amdgcn_image_sample_c_lz:
3386 return AMDGPUISD::IMAGE_SAMPLE_C_LZ;
3387 case Intrinsic::amdgcn_image_sample_c_cd:
3388 return AMDGPUISD::IMAGE_SAMPLE_C_CD;
3389 case Intrinsic::amdgcn_image_sample_c_cd_cl:
3390 return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL;
3391
3392 // Sample with offsets.
3393 case Intrinsic::amdgcn_image_sample_o:
3394 return AMDGPUISD::IMAGE_SAMPLE_O;
3395 case Intrinsic::amdgcn_image_sample_cl_o:
3396 return AMDGPUISD::IMAGE_SAMPLE_CL_O;
3397 case Intrinsic::amdgcn_image_sample_d_o:
3398 return AMDGPUISD::IMAGE_SAMPLE_D_O;
3399 case Intrinsic::amdgcn_image_sample_d_cl_o:
3400 return AMDGPUISD::IMAGE_SAMPLE_D_CL_O;
3401 case Intrinsic::amdgcn_image_sample_l_o:
3402 return AMDGPUISD::IMAGE_SAMPLE_L_O;
3403 case Intrinsic::amdgcn_image_sample_b_o:
3404 return AMDGPUISD::IMAGE_SAMPLE_B_O;
3405 case Intrinsic::amdgcn_image_sample_b_cl_o:
3406 return AMDGPUISD::IMAGE_SAMPLE_B_CL_O;
3407 case Intrinsic::amdgcn_image_sample_lz_o:
3408 return AMDGPUISD::IMAGE_SAMPLE_LZ_O;
3409 case Intrinsic::amdgcn_image_sample_cd_o:
3410 return AMDGPUISD::IMAGE_SAMPLE_CD_O;
3411 case Intrinsic::amdgcn_image_sample_cd_cl_o:
3412 return AMDGPUISD::IMAGE_SAMPLE_CD_CL_O;
3413
3414 // Sample with comparison and offsets.
3415 case Intrinsic::amdgcn_image_sample_c_o:
3416 return AMDGPUISD::IMAGE_SAMPLE_C_O;
3417 case Intrinsic::amdgcn_image_sample_c_cl_o:
3418 return AMDGPUISD::IMAGE_SAMPLE_C_CL_O;
3419 case Intrinsic::amdgcn_image_sample_c_d_o:
3420 return AMDGPUISD::IMAGE_SAMPLE_C_D_O;
3421 case Intrinsic::amdgcn_image_sample_c_d_cl_o:
3422 return AMDGPUISD::IMAGE_SAMPLE_C_D_CL_O;
3423 case Intrinsic::amdgcn_image_sample_c_l_o:
3424 return AMDGPUISD::IMAGE_SAMPLE_C_L_O;
3425 case Intrinsic::amdgcn_image_sample_c_b_o:
3426 return AMDGPUISD::IMAGE_SAMPLE_C_B_O;
3427 case Intrinsic::amdgcn_image_sample_c_b_cl_o:
3428 return AMDGPUISD::IMAGE_SAMPLE_C_B_CL_O;
3429 case Intrinsic::amdgcn_image_sample_c_lz_o:
3430 return AMDGPUISD::IMAGE_SAMPLE_C_LZ_O;
3431 case Intrinsic::amdgcn_image_sample_c_cd_o:
3432 return AMDGPUISD::IMAGE_SAMPLE_C_CD_O;
3433 case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
3434 return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL_O;
3435
3436 // Basic gather4.
3437 case Intrinsic::amdgcn_image_gather4:
3438 return AMDGPUISD::IMAGE_GATHER4;
3439 case Intrinsic::amdgcn_image_gather4_cl:
3440 return AMDGPUISD::IMAGE_GATHER4_CL;
3441 case Intrinsic::amdgcn_image_gather4_l:
3442 return AMDGPUISD::IMAGE_GATHER4_L;
3443 case Intrinsic::amdgcn_image_gather4_b:
3444 return AMDGPUISD::IMAGE_GATHER4_B;
3445 case Intrinsic::amdgcn_image_gather4_b_cl:
3446 return AMDGPUISD::IMAGE_GATHER4_B_CL;
3447 case Intrinsic::amdgcn_image_gather4_lz:
3448 return AMDGPUISD::IMAGE_GATHER4_LZ;
3449
3450 // Gather4 with comparison.
3451 case Intrinsic::amdgcn_image_gather4_c:
3452 return AMDGPUISD::IMAGE_GATHER4_C;
3453 case Intrinsic::amdgcn_image_gather4_c_cl:
3454 return AMDGPUISD::IMAGE_GATHER4_C_CL;
3455 case Intrinsic::amdgcn_image_gather4_c_l:
3456 return AMDGPUISD::IMAGE_GATHER4_C_L;
3457 case Intrinsic::amdgcn_image_gather4_c_b:
3458 return AMDGPUISD::IMAGE_GATHER4_C_B;
3459 case Intrinsic::amdgcn_image_gather4_c_b_cl:
3460 return AMDGPUISD::IMAGE_GATHER4_C_B_CL;
3461 case Intrinsic::amdgcn_image_gather4_c_lz:
3462 return AMDGPUISD::IMAGE_GATHER4_C_LZ;
3463
3464 // Gather4 with offsets.
3465 case Intrinsic::amdgcn_image_gather4_o:
3466 return AMDGPUISD::IMAGE_GATHER4_O;
3467 case Intrinsic::amdgcn_image_gather4_cl_o:
3468 return AMDGPUISD::IMAGE_GATHER4_CL_O;
3469 case Intrinsic::amdgcn_image_gather4_l_o:
3470 return AMDGPUISD::IMAGE_GATHER4_L_O;
3471 case Intrinsic::amdgcn_image_gather4_b_o:
3472 return AMDGPUISD::IMAGE_GATHER4_B_O;
3473 case Intrinsic::amdgcn_image_gather4_b_cl_o:
3474 return AMDGPUISD::IMAGE_GATHER4_B_CL_O;
3475 case Intrinsic::amdgcn_image_gather4_lz_o:
3476 return AMDGPUISD::IMAGE_GATHER4_LZ_O;
3477
3478 // Gather4 with comparison and offsets.
3479 case Intrinsic::amdgcn_image_gather4_c_o:
3480 return AMDGPUISD::IMAGE_GATHER4_C_O;
3481 case Intrinsic::amdgcn_image_gather4_c_cl_o:
3482 return AMDGPUISD::IMAGE_GATHER4_C_CL_O;
3483 case Intrinsic::amdgcn_image_gather4_c_l_o:
3484 return AMDGPUISD::IMAGE_GATHER4_C_L_O;
3485 case Intrinsic::amdgcn_image_gather4_c_b_o:
3486 return AMDGPUISD::IMAGE_GATHER4_C_B_O;
3487 case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
3488 return AMDGPUISD::IMAGE_GATHER4_C_B_CL_O;
3489 case Intrinsic::amdgcn_image_gather4_c_lz_o:
3490 return AMDGPUISD::IMAGE_GATHER4_C_LZ_O;
3491
3492 default:
3493 break;
3494 }
3495 return 0;
3496}
3497
3498static SDValue adjustLoadValueType(SDValue Result, EVT LoadVT, SDLoc DL,
3499 SelectionDAG &DAG, bool Unpacked) {
3500 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3501 // Truncate to v2i16/v4i16.
3502 EVT IntLoadVT = LoadVT.changeTypeToInteger();
3503 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntLoadVT, Result);
3504 // Bitcast to original type (v2f16/v4f16).
3505 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
3506 }
3507 // Cast back to the original packed type.
3508 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3509}
3510
3511// This is to lower INTRINSIC_W_CHAIN with illegal result types.
3512SDValue SITargetLowering::lowerIntrinsicWChain_IllegalReturnType(SDValue Op,
3513 SDValue &Chain, SelectionDAG &DAG) const {
3514 EVT LoadVT = Op.getValueType();
3515 // TODO: handle v3f16.
3516 if (LoadVT != MVT::v2f16 && LoadVT != MVT::v4f16)
3517 return SDValue();
3518
3519 bool Unpacked = Subtarget->hasUnpackedD16VMem();
3520 EVT UnpackedLoadVT = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
3521 EVT EquivLoadVT = Unpacked ? UnpackedLoadVT :
3522 getEquivalentMemType(*DAG.getContext(), LoadVT);
3523 // Change from v4f16/v2f16 to EquivLoadVT.
3524 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3525
3526 SDValue Res;
3527 SDLoc DL(Op);
3528 MemSDNode *M = cast<MemSDNode>(Op);
3529 unsigned IID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
3530 switch (IID) {
3531 case Intrinsic::amdgcn_tbuffer_load: {
3532 SDValue Ops[] = {
3533 Op.getOperand(0), // Chain
3534 Op.getOperand(2), // rsrc
3535 Op.getOperand(3), // vindex
3536 Op.getOperand(4), // voffset
3537 Op.getOperand(5), // soffset
3538 Op.getOperand(6), // offset
3539 Op.getOperand(7), // dfmt
3540 Op.getOperand(8), // nfmt
3541 Op.getOperand(9), // glc
3542 Op.getOperand(10) // slc
3543 };
3544 Res = DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL,
3545 VTList, Ops, M->getMemoryVT(),
3546 M->getMemOperand());
3547 Chain = Res.getValue(1);
3548 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
3549 }
3550 case Intrinsic::amdgcn_buffer_load_format: {
3551 SDValue Ops[] = {
3552 Op.getOperand(0), // Chain
3553 Op.getOperand(2), // rsrc
3554 Op.getOperand(3), // vindex
3555 Op.getOperand(4), // offset
3556 Op.getOperand(5), // glc
3557 Op.getOperand(6) // slc
3558 };
3559 Res = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
3560 DL, VTList, Ops, M->getMemoryVT(),
3561 M->getMemOperand());
3562 Chain = Res.getValue(1);
3563 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
3564 }
3565 case Intrinsic::amdgcn_image_load:
3566 case Intrinsic::amdgcn_image_load_mip: {
3567 SDValue Ops[] = {
3568 Op.getOperand(0), // Chain
3569 Op.getOperand(2), // vaddr
3570 Op.getOperand(3), // rsrc
3571 Op.getOperand(4), // dmask
3572 Op.getOperand(5), // glc
3573 Op.getOperand(6), // slc
3574 Op.getOperand(7), // lwe
3575 Op.getOperand(8) // da
3576 };
3577 unsigned Opc = getImageOpcode(IID);
3578 Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(),
3579 M->getMemOperand());
3580 Chain = Res.getValue(1);
3581 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
3582 }
3583 // Basic sample.
3584 case Intrinsic::amdgcn_image_sample:
3585 case Intrinsic::amdgcn_image_sample_cl:
3586 case Intrinsic::amdgcn_image_sample_d:
3587 case Intrinsic::amdgcn_image_sample_d_cl:
3588 case Intrinsic::amdgcn_image_sample_l:
3589 case Intrinsic::amdgcn_image_sample_b:
3590 case Intrinsic::amdgcn_image_sample_b_cl:
3591 case Intrinsic::amdgcn_image_sample_lz:
3592 case Intrinsic::amdgcn_image_sample_cd:
3593 case Intrinsic::amdgcn_image_sample_cd_cl:
3594
3595 // Sample with comparison.
3596 case Intrinsic::amdgcn_image_sample_c:
3597 case Intrinsic::amdgcn_image_sample_c_cl:
3598 case Intrinsic::amdgcn_image_sample_c_d:
3599 case Intrinsic::amdgcn_image_sample_c_d_cl:
3600 case Intrinsic::amdgcn_image_sample_c_l:
3601 case Intrinsic::amdgcn_image_sample_c_b:
3602 case Intrinsic::amdgcn_image_sample_c_b_cl:
3603 case Intrinsic::amdgcn_image_sample_c_lz:
3604 case Intrinsic::amdgcn_image_sample_c_cd:
3605 case Intrinsic::amdgcn_image_sample_c_cd_cl:
3606
3607 // Sample with offsets.
3608 case Intrinsic::amdgcn_image_sample_o:
3609 case Intrinsic::amdgcn_image_sample_cl_o:
3610 case Intrinsic::amdgcn_image_sample_d_o:
3611 case Intrinsic::amdgcn_image_sample_d_cl_o:
3612 case Intrinsic::amdgcn_image_sample_l_o:
3613 case Intrinsic::amdgcn_image_sample_b_o:
3614 case Intrinsic::amdgcn_image_sample_b_cl_o:
3615 case Intrinsic::amdgcn_image_sample_lz_o:
3616 case Intrinsic::amdgcn_image_sample_cd_o:
3617 case Intrinsic::amdgcn_image_sample_cd_cl_o:
3618
3619 // Sample with comparison and offsets.
3620 case Intrinsic::amdgcn_image_sample_c_o:
3621 case Intrinsic::amdgcn_image_sample_c_cl_o:
3622 case Intrinsic::amdgcn_image_sample_c_d_o:
3623 case Intrinsic::amdgcn_image_sample_c_d_cl_o:
3624 case Intrinsic::amdgcn_image_sample_c_l_o:
3625 case Intrinsic::amdgcn_image_sample_c_b_o:
3626 case Intrinsic::amdgcn_image_sample_c_b_cl_o:
3627 case Intrinsic::amdgcn_image_sample_c_lz_o:
3628 case Intrinsic::amdgcn_image_sample_c_cd_o:
3629 case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
3630
3631 // Basic gather4
3632 case Intrinsic::amdgcn_image_gather4:
3633 case Intrinsic::amdgcn_image_gather4_cl:
3634 case Intrinsic::amdgcn_image_gather4_l:
3635 case Intrinsic::amdgcn_image_gather4_b:
3636 case Intrinsic::amdgcn_image_gather4_b_cl:
3637 case Intrinsic::amdgcn_image_gather4_lz:
3638
3639 // Gather4 with comparison
3640 case Intrinsic::amdgcn_image_gather4_c:
3641 case Intrinsic::amdgcn_image_gather4_c_cl:
3642 case Intrinsic::amdgcn_image_gather4_c_l:
3643 case Intrinsic::amdgcn_image_gather4_c_b:
3644 case Intrinsic::amdgcn_image_gather4_c_b_cl:
3645 case Intrinsic::amdgcn_image_gather4_c_lz:
3646
3647 // Gather4 with offsets
3648 case Intrinsic::amdgcn_image_gather4_o:
3649 case Intrinsic::amdgcn_image_gather4_cl_o:
3650 case Intrinsic::amdgcn_image_gather4_l_o:
3651 case Intrinsic::amdgcn_image_gather4_b_o:
3652 case Intrinsic::amdgcn_image_gather4_b_cl_o:
3653 case Intrinsic::amdgcn_image_gather4_lz_o:
3654
3655 // Gather4 with comparison and offsets
3656 case Intrinsic::amdgcn_image_gather4_c_o:
3657 case Intrinsic::amdgcn_image_gather4_c_cl_o:
3658 case Intrinsic::amdgcn_image_gather4_c_l_o:
3659 case Intrinsic::amdgcn_image_gather4_c_b_o:
3660 case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
3661 case Intrinsic::amdgcn_image_gather4_c_lz_o: {
3662 SDValue Ops[] = {
3663 Op.getOperand(0), // Chain
3664 Op.getOperand(2), // vaddr
3665 Op.getOperand(3), // rsrc
3666 Op.getOperand(4), // sampler
3667 Op.getOperand(5), // dmask
3668 Op.getOperand(6), // unorm
3669 Op.getOperand(7), // glc
3670 Op.getOperand(8), // slc
3671 Op.getOperand(9), // lwe
3672 Op.getOperand(10) // da
3673 };
3674 unsigned Opc = getImageOpcode(IID);
3675 Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(),
3676 M->getMemOperand());
3677 Chain = Res.getValue(1);
3678 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
3679 }
3680 default: {
3681 const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
3682 AMDGPU::lookupD16ImageDimIntrinsicByIntr(IID);
3683 if (D16ImageDimIntr) {
3684 SmallVector<SDValue, 20> Ops;
3685 for (auto Value : Op.getNode()->op_values())
3686 Ops.push_back(Value);
3687 Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
3688 Res = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTList, Ops,
3689 M->getMemoryVT(), M->getMemOperand());
3690 Chain = Res.getValue(1);
3691 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
3692 }
3693
3694 return SDValue();
3695 }
3696 }
3697}
3698
3699void SITargetLowering::ReplaceNodeResults(SDNode *N,
3700 SmallVectorImpl<SDValue> &Results,
3701 SelectionDAG &DAG) const {
3702 switch (N->getOpcode()) {
3703 case ISD::INSERT_VECTOR_ELT: {
3704 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3705 Results.push_back(Res);
3706 return;
3707 }
3708 case ISD::EXTRACT_VECTOR_ELT: {
3709 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3710 Results.push_back(Res);
3711 return;
3712 }
3713 case ISD::INTRINSIC_WO_CHAIN: {
3714 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3715 switch (IID) {
3716 case Intrinsic::amdgcn_cvt_pkrtz: {
3717 SDValue Src0 = N->getOperand(1);
3718 SDValue Src1 = N->getOperand(2);
3719 SDLoc SL(N);
3720 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3721 Src0, Src1);
3722 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3723 return;
3724 }
3725 case Intrinsic::amdgcn_cvt_pknorm_i16:
3726 case Intrinsic::amdgcn_cvt_pknorm_u16:
3727 case Intrinsic::amdgcn_cvt_pk_i16:
3728 case Intrinsic::amdgcn_cvt_pk_u16: {
3729 SDValue Src0 = N->getOperand(1);
3730 SDValue Src1 = N->getOperand(2);
3731 SDLoc SL(N);
3732 unsigned Opcode;
3733
3734 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3735 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3736 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3737 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3738 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3739 Opcode = AMDGPUISD::CVT_PK_I16_I32;
3740 else
3741 Opcode = AMDGPUISD::CVT_PK_U16_U32;
3742
3743 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3744 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3745 return;
3746 }
3747 }
3748 break;
3749 }
3750 case ISD::INTRINSIC_W_CHAIN: {
3751 SDValue Chain;
3752 if (SDValue Res = lowerIntrinsicWChain_IllegalReturnType(SDValue(N, 0),
3753 Chain, DAG)) {
3754 Results.push_back(Res);
3755 Results.push_back(Chain);
3756 return;
3757 }
3758 break;
3759 }
3760 case ISD::SELECT: {
3761 SDLoc SL(N);
3762 EVT VT = N->getValueType(0);
3763 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3764 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3765 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3766
3767 EVT SelectVT = NewVT;
3768 if (NewVT.bitsLT(MVT::i32)) {
3769 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3770 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3771 SelectVT = MVT::i32;
3772 }
3773
3774 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3775 N->getOperand(0), LHS, RHS);
3776
3777 if (NewVT != SelectVT)
3778 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3779 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3780 return;
3781 }
3782 default:
3783 break;
3784 }
3785}
3786
3787/// \brief Helper function for LowerBRCOND
3788static SDNode *findUser(SDValue Value, unsigned Opcode) {
3789
3790 SDNode *Parent = Value.getNode();
3791 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3792 I != E; ++I) {
3793
3794 if (I.getUse().get() != Value)
3795 continue;
3796
3797 if (I->getOpcode() == Opcode)
3798 return *I;
3799 }
3800 return nullptr;
3801}
3802
3803unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3804 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3805 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3806 case Intrinsic::amdgcn_if:
3807 return AMDGPUISD::IF;
3808 case Intrinsic::amdgcn_else:
3809 return AMDGPUISD::ELSE;
3810 case Intrinsic::amdgcn_loop:
3811 return AMDGPUISD::LOOP;
3812 case Intrinsic::amdgcn_end_cf:
3813 llvm_unreachable("should not occur")::llvm::llvm_unreachable_internal("should not occur", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3813)
;
3814 default:
3815 return 0;
3816 }
3817 }
3818
3819 // break, if_break, else_break are all only used as inputs to loop, not
3820 // directly as branch conditions.
3821 return 0;
3822}
3823
3824void SITargetLowering::createDebuggerPrologueStackObjects(
3825 MachineFunction &MF) const {
3826 // Create stack objects that are used for emitting debugger prologue.
3827 //
3828 // Debugger prologue writes work group IDs and work item IDs to scratch memory
3829 // at fixed location in the following format:
3830 // offset 0: work group ID x
3831 // offset 4: work group ID y
3832 // offset 8: work group ID z
3833 // offset 16: work item ID x
3834 // offset 20: work item ID y
3835 // offset 24: work item ID z
3836 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3837 int ObjectIdx = 0;
3838
3839 // For each dimension:
3840 for (unsigned i = 0; i < 3; ++i) {
3841 // Create fixed stack object for work group ID.
3842 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
3843 Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
3844 // Create fixed stack object for work item ID.
3845 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
3846 Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
3847 }
3848}
3849
3850bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3851 const Triple &TT = getTargetMachine().getTargetTriple();
3852 return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
3853 GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
3854 AMDGPU::shouldEmitConstantsToTextSection(TT);
3855}
3856
3857bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
3858 return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
3859 GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
3860 GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
3861 !shouldEmitFixup(GV) &&
3862 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
3863}
3864
3865bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
3866 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
3867}
3868
3869/// This transforms the control flow intrinsics to get the branch destination as
3870/// the last parameter, and also switches the branch target with BR if the need arises.
3871SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
3872 SelectionDAG &DAG) const {
3873 SDLoc DL(BRCOND);
3874
3875 SDNode *Intr = BRCOND.getOperand(1).getNode();
3876 SDValue Target = BRCOND.getOperand(2);
3877 SDNode *BR = nullptr;
3878 SDNode *SetCC = nullptr;
3879
3880 if (Intr->getOpcode() == ISD::SETCC) {
3881 // As long as we negate the condition everything is fine
3882 SetCC = Intr;
3883 Intr = SetCC->getOperand(0).getNode();
3884
3885 } else {
3886 // Get the target from BR if we don't negate the condition
3887 BR = findUser(BRCOND, ISD::BR);
3888 Target = BR->getOperand(1);
3889 }
3890
3891 // FIXME: This changes the types of the intrinsics instead of introducing new
3892 // nodes with the correct types.
3893 // e.g. llvm.amdgcn.loop
3894
3895 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
3896 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
3897
3898 unsigned CFNode = isCFIntrinsic(Intr);
3899 if (CFNode == 0) {
3900 // This is a uniform branch so we don't need to legalize.
3901 return BRCOND;
3902 }
3903
3904 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
3905 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
3906
3907 assert(!SetCC ||(static_cast <bool> (!SetCC || (SetCC->getConstantOperandVal
(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand
(2).getNode())->get() == ISD::SETNE)) ? void (0) : __assert_fail
("!SetCC || (SetCC->getConstantOperandVal(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == ISD::SETNE)"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3910, __extension__ __PRETTY_FUNCTION__))
3908 (SetCC->getConstantOperandVal(1) == 1 &&(static_cast <bool> (!SetCC || (SetCC->getConstantOperandVal
(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand
(2).getNode())->get() == ISD::SETNE)) ? void (0) : __assert_fail
("!SetCC || (SetCC->getConstantOperandVal(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == ISD::SETNE)"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3910, __extension__ __PRETTY_FUNCTION__))
3909 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==(static_cast <bool> (!SetCC || (SetCC->getConstantOperandVal
(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand
(2).getNode())->get() == ISD::SETNE)) ? void (0) : __assert_fail
("!SetCC || (SetCC->getConstantOperandVal(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == ISD::SETNE)"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3910, __extension__ __PRETTY_FUNCTION__))
3910 ISD::SETNE))(static_cast <bool> (!SetCC || (SetCC->getConstantOperandVal
(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand
(2).getNode())->get() == ISD::SETNE)) ? void (0) : __assert_fail
("!SetCC || (SetCC->getConstantOperandVal(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == ISD::SETNE)"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3910, __extension__ __PRETTY_FUNCTION__))
;
3911
3912 // operands of the new intrinsic call
3913 SmallVector<SDValue, 4> Ops;
3914 if (HaveChain)
3915 Ops.push_back(BRCOND.getOperand(0));
3916
3917 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
3918 Ops.push_back(Target);
3919
3920 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
3921
3922 // build the new intrinsic call
3923 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
3924
3925 if (!HaveChain) {
3926 SDValue Ops[] = {
3927 SDValue(Result, 0),
3928 BRCOND.getOperand(0)
3929 };
3930
3931 Result = DAG.getMergeValues(Ops, DL).getNode();
3932 }
3933
3934 if (BR) {
3935 // Give the branch instruction our target
3936 SDValue Ops[] = {
3937 BR->getOperand(0),
3938 BRCOND.getOperand(2)
3939 };
3940 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
3941 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
3942 BR = NewBR.getNode();
3943 }
3944
3945 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
3946
3947 // Copy the intrinsic results to registers
3948 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
3949 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
3950 if (!CopyToReg)
3951 continue;
3952
3953 Chain = DAG.getCopyToReg(
3954 Chain, DL,
3955 CopyToReg->getOperand(1),
3956 SDValue(Result, i - 1),
3957 SDValue());
3958
3959 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
3960 }
3961
3962 // Remove the old intrinsic from the chain
3963 DAG.ReplaceAllUsesOfValueWith(
3964 SDValue(Intr, Intr->getNumValues() - 1),
3965 Intr->getOperand(0));
3966
3967 return Chain;
3968}
3969
3970SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
3971 SDValue Op,
3972 const SDLoc &DL,
3973 EVT VT) const {
3974 return Op.getValueType().bitsLE(VT) ?
3975 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
3976 DAG.getNode(ISD::FTRUNC, DL, VT, Op);
3977}
3978
3979SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
3980 assert(Op.getValueType() == MVT::f16 &&(static_cast <bool> (Op.getValueType() == MVT::f16 &&
"Do not know how to custom lower FP_ROUND for non-f16 type")
? void (0) : __assert_fail ("Op.getValueType() == MVT::f16 && \"Do not know how to custom lower FP_ROUND for non-f16 type\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3981, __extension__ __PRETTY_FUNCTION__))
3981 "Do not know how to custom lower FP_ROUND for non-f16 type")(static_cast <bool> (Op.getValueType() == MVT::f16 &&
"Do not know how to custom lower FP_ROUND for non-f16 type")
? void (0) : __assert_fail ("Op.getValueType() == MVT::f16 && \"Do not know how to custom lower FP_ROUND for non-f16 type\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3981, __extension__ __PRETTY_FUNCTION__))
;
3982
3983 SDValue Src = Op.getOperand(0);
3984 EVT SrcVT = Src.getValueType();
3985 if (SrcVT != MVT::f64)
3986 return Op;
3987
3988 SDLoc DL(Op);
3989
3990 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
3991 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
3992 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
3993}
3994
3995SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
3996 SDLoc SL(Op);
3997 MachineFunction &MF = DAG.getMachineFunction();
3998 SDValue Chain = Op.getOperand(0);
3999
4000 unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ?
4001 SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap;
4002
4003 if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
4004 Subtarget->isTrapHandlerEnabled()) {
4005 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4006 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4007 assert(UserSGPR != AMDGPU::NoRegister)(static_cast <bool> (UserSGPR != AMDGPU::NoRegister) ? void
(0) : __assert_fail ("UserSGPR != AMDGPU::NoRegister", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4007, __extension__ __PRETTY_FUNCTION__))
;
4008
4009 SDValue QueuePtr = CreateLiveInRegister(
4010 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4011
4012 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4013
4014 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4015 QueuePtr, SDValue());
4016
4017 SDValue Ops[] = {
4018 ToReg,
4019 DAG.getTargetConstant(TrapID, SL, MVT::i16),
4020 SGPR01,
4021 ToReg.getValue(1)
4022 };
4023
4024 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4025 }
4026
4027 switch (TrapID) {
4028 case SISubtarget::TrapIDLLVMTrap:
4029 return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
4030 case SISubtarget::TrapIDLLVMDebugTrap: {
4031 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
4032 "debugtrap handler not supported",
4033 Op.getDebugLoc(),
4034 DS_Warning);
4035 LLVMContext &Ctx = MF.getFunction().getContext();
4036 Ctx.diagnose(NoTrap);
4037 return Chain;
4038 }
4039 default:
4040 llvm_unreachable("unsupported trap handler type!")::llvm::llvm_unreachable_internal("unsupported trap handler type!"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4040)
;
4041 }
4042
4043 return Chain;
4044}
4045
4046SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
4047 SelectionDAG &DAG) const {
4048 // FIXME: Use inline constants (src_{shared, private}_base) instead.
4049 if (Subtarget->hasApertureRegs()) {
4050 unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
4051 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4052 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
4053 unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
4054 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4055 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4056 unsigned Encoding =
4057 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4058 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4059 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
4060
4061 SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4062 SDValue ApertureReg = SDValue(
4063 DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4064 SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4065 return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
4066 }
4067
4068 MachineFunction &MF = DAG.getMachineFunction();
4069 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4070 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4071 assert(UserSGPR != AMDGPU::NoRegister)(static_cast <bool> (UserSGPR != AMDGPU::NoRegister) ? void
(0) : __assert_fail ("UserSGPR != AMDGPU::NoRegister", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4071, __extension__ __PRETTY_FUNCTION__))
;
4072
4073 SDValue QueuePtr = CreateLiveInRegister(
4074 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4075
4076 // Offset into amd_queue_t for group_segment_aperture_base_hi /
4077 // private_segment_aperture_base_hi.
4078 uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
4079
4080 SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
4081
4082 // TODO: Use custom target PseudoSourceValue.
4083 // TODO: We should use the value from the IR intrinsic call, but it might not
4084 // be available and how do we get it?
4085 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
4086 AMDGPUASI.CONSTANT_ADDRESS));
4087
4088 MachinePointerInfo PtrInfo(V, StructOffset);
4089 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
4090 MinAlign(64, StructOffset),
4091 MachineMemOperand::MODereferenceable |
4092 MachineMemOperand::MOInvariant);
4093}
4094
4095SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4096 SelectionDAG &DAG) const {
4097 SDLoc SL(Op);
4098 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4099
4100 SDValue Src = ASC->getOperand(0);
4101 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4102
4103 const AMDGPUTargetMachine &TM =
4104 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4105
4106 // flat -> local/private
4107 if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
4108 unsigned DestAS = ASC->getDestAddressSpace();
4109
4110 if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
4111 DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
4112 unsigned NullVal = TM.getNullPointerValue(DestAS);
4113 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4114 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4115 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4116
4117 return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4118 NonNull, Ptr, SegmentNullPtr);
4119 }
4120 }
4121
4122 // local/private -> flat
4123 if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
4124 unsigned SrcAS = ASC->getSrcAddressSpace();
4125
4126 if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
4127 SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
4128 unsigned NullVal = TM.getNullPointerValue(SrcAS);
4129 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4130
4131 SDValue NonNull
4132 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4133
4134 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
4135 SDValue CvtPtr
4136 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4137
4138 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4139 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4140 FlatNullPtr);
4141 }
4142 }
4143
4144 // global <-> flat are no-ops and never emitted.
4145
4146 const MachineFunction &MF = DAG.getMachineFunction();
4147 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
4148 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
4149 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4150
4151 return DAG.getUNDEF(ASC->getValueType(0));
4152}
4153
4154SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4155 SelectionDAG &DAG) const {
4156 SDValue Idx = Op.getOperand(2);
4157 if (isa<ConstantSDNode>(Idx))
4158 return SDValue();
4159
4160 // Avoid stack access for dynamic indexing.
4161 SDLoc SL(Op);
4162 SDValue Vec = Op.getOperand(0);
4163 SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
4164
4165 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4166 SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
4167
4168 // Convert vector index to bit-index.
4169 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
4170 DAG.getConstant(16, SL, MVT::i32));
4171
4172 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4173
4174 SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
4175 DAG.getConstant(0xffff, SL, MVT::i32),
4176 ScaledIdx);
4177
4178 SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
4179 SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
4180 DAG.getNOT(SL, BFM, MVT::i32), BCVec);
4181
4182 SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
4183 return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
4184}
4185
4186SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4187 SelectionDAG &DAG) const {
4188 SDLoc SL(Op);
4189
4190 EVT ResultVT = Op.getValueType();
4191 SDValue Vec = Op.getOperand(0);
4192 SDValue Idx = Op.getOperand(1);
4193
4194 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4195
4196 // Make sure we we do any optimizations that will make it easier to fold
4197 // source modifiers before obscuring it with bit operations.
4198
4199 // XXX - Why doesn't this get called when vector_shuffle is expanded?
4200 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4201 return Combined;
4202
4203 if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
4204 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4205
4206 if (CIdx->getZExtValue() == 1) {
4207 Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
4208 DAG.getConstant(16, SL, MVT::i32));
4209 } else {
4210 assert(CIdx->getZExtValue() == 0)(static_cast <bool> (CIdx->getZExtValue() == 0) ? void
(0) : __assert_fail ("CIdx->getZExtValue() == 0", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4210, __extension__ __PRETTY_FUNCTION__))
;
4211 }
4212
4213 if (ResultVT.bitsLT(MVT::i32))
4214 Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
4215 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4216 }
4217
4218 SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
4219
4220 // Convert vector index to bit-index.
4221 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen);
4222
4223 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4224 SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
4225
4226 SDValue Result = Elt;
4227 if (ResultVT.bitsLT(MVT::i32))
4228 Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
4229
4230 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4231}
4232
4233bool
4234SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4235 // We can fold offsets for anything that doesn't require a GOT relocation.
4236 return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
4237 GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
4238 GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
4239 !shouldEmitGOTReloc(GA->getGlobal());
4240}
4241
4242static SDValue
4243buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4244 const SDLoc &DL, unsigned Offset, EVT PtrVT,
4245 unsigned GAFlags = SIInstrInfo::MO_NONE) {
4246 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4247 // lowered to the following code sequence:
4248 //
4249 // For constant address space:
4250 // s_getpc_b64 s[0:1]
4251 // s_add_u32 s0, s0, $symbol
4252 // s_addc_u32 s1, s1, 0
4253 //
4254 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4255 // a fixup or relocation is emitted to replace $symbol with a literal
4256 // constant, which is a pc-relative offset from the encoding of the $symbol
4257 // operand to the global variable.
4258 //
4259 // For global address space:
4260 // s_getpc_b64 s[0:1]
4261 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4262 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4263 //
4264 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4265 // fixups or relocations are emitted to replace $symbol@*@lo and
4266 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4267 // which is a 64-bit pc-relative offset from the encoding of the $symbol
4268 // operand to the global variable.
4269 //
4270 // What we want here is an offset from the value returned by s_getpc
4271 // (which is the address of the s_add_u32 instruction) to the global
4272 // variable, but since the encoding of $symbol starts 4 bytes after the start
4273 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4274 // small. This requires us to add 4 to the global variable offset in order to
4275 // compute the correct address.
4276 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4277 GAFlags);
4278 SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4279 GAFlags == SIInstrInfo::MO_NONE ?
4280 GAFlags : GAFlags + 1);
4281 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
4282}
4283
4284SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4285 SDValue Op,
4286 SelectionDAG &DAG) const {
4287 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
4288 const GlobalValue *GV = GSD->getGlobal();
4289
4290 if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
4291 GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
4292 GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
4293 // FIXME: It isn't correct to rely on the type of the pointer. This should
4294 // be removed when address space 0 is 64-bit.
4295 !GV->getType()->getElementType()->isFunctionTy())
4296 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4297
4298 SDLoc DL(GSD);
4299 EVT PtrVT = Op.getValueType();
4300
4301 if (shouldEmitFixup(GV))
4302 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
4303 else if (shouldEmitPCReloc(GV))
4304 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4305 SIInstrInfo::MO_REL32);
4306
4307 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
4308 SIInstrInfo::MO_GOTPCREL32);
4309
4310 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
4311 PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
4312 const DataLayout &DataLayout = DAG.getDataLayout();
4313 unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
4314 // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
4315 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
4316
4317 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
4318 MachineMemOperand::MODereferenceable |
4319 MachineMemOperand::MOInvariant);
4320}
4321
4322SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4323 const SDLoc &DL, SDValue V) const {
4324 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4325 // the destination register.
4326 //
4327 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4328 // so we will end up with redundant moves to m0.
4329 //
4330 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4331
4332 // A Null SDValue creates a glue result.
4333 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4334 V, Chain);
4335 return SDValue(M0, 0);
4336}
4337
4338SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4339 SDValue Op,
4340 MVT VT,
4341 unsigned Offset) const {
4342 SDLoc SL(Op);
4343 SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
4344 DAG.getEntryNode(), Offset, false);
4345 // The local size values will have the hi 16-bits as zero.
4346 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4347 DAG.getValueType(VT));
4348}
4349
4350static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4351 EVT VT) {
4352 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4353 "non-hsa intrinsic with hsa target",
4354 DL.getDebugLoc());
4355 DAG.getContext()->diagnose(BadIntrin);
4356 return DAG.getUNDEF(VT);
4357}
4358
4359static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4360 EVT VT) {
4361 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4362 "intrinsic not supported on subtarget",
4363 DL.getDebugLoc());
4364 DAG.getContext()->diagnose(BadIntrin);
4365 return DAG.getUNDEF(VT);
4366}
4367
4368SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
4369 SelectionDAG &DAG) const {
4370 MachineFunction &MF = DAG.getMachineFunction();
4371 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
4372
4373 EVT VT = Op.getValueType();
4374 SDLoc DL(Op);
4375 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4376
4377 // TODO: Should this propagate fast-math-flags?
4378
4379 switch (IntrinsicID) {
4380 case Intrinsic::amdgcn_implicit_buffer_ptr: {
4381 if (getSubtarget()->isAmdCodeObjectV2(MF))
4382 return emitNonHSAIntrinsicError(DAG, DL, VT);
4383 return getPreloadedValue(DAG, *MFI, VT,
4384 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4385 }
4386 case Intrinsic::amdgcn_dispatch_ptr:
4387 case Intrinsic::amdgcn_queue_ptr: {
4388 if (!Subtarget->isAmdCodeObjectV2(MF)) {
4389 DiagnosticInfoUnsupported BadIntrin(
4390 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
4391 DL.getDebugLoc());
4392 DAG.getContext()->diagnose(BadIntrin);
4393 return DAG.getUNDEF(VT);
4394 }
4395
4396 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
4397 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
4398 return getPreloadedValue(DAG, *MFI, VT, RegID);
4399 }
4400 case Intrinsic::amdgcn_implicitarg_ptr: {
4401 if (MFI->isEntryFunction())
4402 return getImplicitArgPtr(DAG, DL);
4403 return getPreloadedValue(DAG, *MFI, VT,
4404 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4405 }
4406 case Intrinsic::amdgcn_kernarg_segment_ptr: {
4407 return getPreloadedValue(DAG, *MFI, VT,
4408 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4409 }
4410 case Intrinsic::amdgcn_dispatch_id: {
4411 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
4412 }
4413 case Intrinsic::amdgcn_rcp:
4414 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
4415 case Intrinsic::amdgcn_rsq:
4416 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
4417 case Intrinsic::amdgcn_rsq_legacy:
4418 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
4419 return emitRemovedIntrinsicError(DAG, DL, VT);
4420
4421 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
4422 case Intrinsic::amdgcn_rcp_legacy:
4423 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
4424 return emitRemovedIntrinsicError(DAG, DL, VT);
4425 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
4426 case Intrinsic::amdgcn_rsq_clamp: {
4427 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
4428 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
4429
4430 Type *Type = VT.getTypeForEVT(*DAG.getContext());
4431 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
4432 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
4433
4434 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
4435 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
4436 DAG.getConstantFP(Max, DL, VT));
4437 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
4438 DAG.getConstantFP(Min, DL, VT));
4439 }
4440 case Intrinsic::r600_read_ngroups_x:
4441 if (Subtarget->isAmdHsaOS())
4442 return emitNonHSAIntrinsicError(DAG, DL, VT);
4443
4444 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4445 SI::KernelInputOffsets::NGROUPS_X, false);
4446 case Intrinsic::r600_read_ngroups_y:
4447 if (Subtarget->isAmdHsaOS())
4448 return emitNonHSAIntrinsicError(DAG, DL, VT);
4449
4450 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4451 SI::KernelInputOffsets::NGROUPS_Y, false);
4452 case Intrinsic::r600_read_ngroups_z:
4453 if (Subtarget->isAmdHsaOS())
4454 return emitNonHSAIntrinsicError(DAG, DL, VT);
4455
4456 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4457 SI::KernelInputOffsets::NGROUPS_Z, false);
4458 case Intrinsic::r600_read_global_size_x:
4459 if (Subtarget->isAmdHsaOS())
4460 return emitNonHSAIntrinsicError(DAG, DL, VT);
4461
4462 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4463 SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
4464 case Intrinsic::r600_read_global_size_y:
4465 if (Subtarget->isAmdHsaOS())
4466 return emitNonHSAIntrinsicError(DAG, DL, VT);
4467
4468 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4469 SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
4470 case Intrinsic::r600_read_global_size_z:
4471 if (Subtarget->isAmdHsaOS())
4472 return emitNonHSAIntrinsicError(DAG, DL, VT);
4473
4474 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4475 SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
4476 case Intrinsic::r600_read_local_size_x:
4477 if (Subtarget->isAmdHsaOS())
4478 return emitNonHSAIntrinsicError(DAG, DL, VT);
4479
4480 return lowerImplicitZextParam(DAG, Op, MVT::i16,
4481 SI::KernelInputOffsets::LOCAL_SIZE_X);
4482 case Intrinsic::r600_read_local_size_y:
4483 if (Subtarget->isAmdHsaOS())
4484 return emitNonHSAIntrinsicError(DAG, DL, VT);
4485
4486 return lowerImplicitZextParam(DAG, Op, MVT::i16,
4487 SI::KernelInputOffsets::LOCAL_SIZE_Y);
4488 case Intrinsic::r600_read_local_size_z:
4489 if (Subtarget->isAmdHsaOS())
4490 return emitNonHSAIntrinsicError(DAG, DL, VT);
4491
4492 return lowerImplicitZextParam(DAG, Op, MVT::i16,
4493 SI::KernelInputOffsets::LOCAL_SIZE_Z);
4494 case Intrinsic::amdgcn_workgroup_id_x:
4495 case Intrinsic::r600_read_tgid_x:
4496 return getPreloadedValue(DAG, *MFI, VT,
4497 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4498 case Intrinsic::amdgcn_workgroup_id_y:
4499 case Intrinsic::r600_read_tgid_y:
4500 return getPreloadedValue(DAG, *MFI, VT,
4501 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4502 case Intrinsic::amdgcn_workgroup_id_z:
4503 case Intrinsic::r600_read_tgid_z:
4504 return getPreloadedValue(DAG, *MFI, VT,
4505 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4506 case Intrinsic::amdgcn_workitem_id_x: {
4507 case Intrinsic::r600_read_tidig_x:
4508 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
4509 SDLoc(DAG.getEntryNode()),
4510 MFI->getArgInfo().WorkItemIDX);
4511 }
4512 case Intrinsic::amdgcn_workitem_id_y:
4513 case Intrinsic::r600_read_tidig_y:
4514 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
4515 SDLoc(DAG.getEntryNode()),
4516 MFI->getArgInfo().WorkItemIDY);
4517 case Intrinsic::amdgcn_workitem_id_z:
4518 case Intrinsic::r600_read_tidig_z:
4519 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
4520 SDLoc(DAG.getEntryNode()),
4521 MFI->getArgInfo().WorkItemIDZ);
4522 case AMDGPUIntrinsic::SI_load_const: {
4523 SDValue Ops[] = {
4524 Op.getOperand(1),
4525 Op.getOperand(2)
4526 };
4527
4528 MachineMemOperand *MMO = MF.getMachineMemOperand(
4529 MachinePointerInfo(),
4530 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4531 MachineMemOperand::MOInvariant,
4532 VT.getStoreSize(), 4);
4533 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
4534 Op->getVTList(), Ops, VT, MMO);
4535 }
4536 case Intrinsic::amdgcn_fdiv_fast:
4537 return lowerFDIV_FAST(Op, DAG);
4538 case Intrinsic::amdgcn_interp_mov: {
4539 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
4540 SDValue Glue = M0.getValue(1);
4541 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
4542 Op.getOperand(2), Op.getOperand(3), Glue);
4543 }
4544 case Intrinsic::amdgcn_interp_p1: {
4545 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
4546 SDValue Glue = M0.getValue(1);
4547 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
4548 Op.getOperand(2), Op.getOperand(3), Glue);
4549 }
4550 case Intrinsic::amdgcn_interp_p2: {
4551 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
4552 SDValue Glue = SDValue(M0.getNode(), 1);
4553 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
4554 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
4555 Glue);
4556 }
4557 case Intrinsic::amdgcn_sin:
4558 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
4559
4560 case Intrinsic::amdgcn_cos:
4561 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
4562
4563 case Intrinsic::amdgcn_log_clamp: {
4564 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
4565 return SDValue();
4566
4567 DiagnosticInfoUnsupported BadIntrin(
4568 MF.getFunction(), "intrinsic not supported on subtarget",
4569 DL.getDebugLoc());
4570 DAG.getContext()->diagnose(BadIntrin);
4571 return DAG.getUNDEF(VT);
4572 }
4573 case Intrinsic::amdgcn_ldexp:
4574 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
4575 Op.getOperand(1), Op.getOperand(2));
4576
4577 case Intrinsic::amdgcn_fract:
4578 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
4579
4580 case Intrinsic::amdgcn_class:
4581 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
4582 Op.getOperand(1), Op.getOperand(2));
4583 case Intrinsic::amdgcn_div_fmas:
4584 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
4585 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
4586 Op.getOperand(4));
4587
4588 case Intrinsic::amdgcn_div_fixup:
4589 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
4590 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4591
4592 case Intrinsic::amdgcn_trig_preop:
4593 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
4594 Op.getOperand(1), Op.getOperand(2));
4595 case Intrinsic::amdgcn_div_scale: {
4596 // 3rd parameter required to be a constant.
4597 const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
4598 if (!Param)
4599 return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
4600
4601 // Translate to the operands expected by the machine instruction. The
4602 // first parameter must be the same as the first instruction.
4603 SDValue Numerator = Op.getOperand(1);
4604 SDValue Denominator = Op.getOperand(2);
4605
4606 // Note this order is opposite of the machine instruction's operations,
4607 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
4608 // intrinsic has the numerator as the first operand to match a normal
4609 // division operation.
4610
4611 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
4612
4613 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
4614 Denominator, Numerator);
4615 }
4616 case Intrinsic::amdgcn_icmp: {
4617 const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
4618 if (!CD)
4619 return DAG.getUNDEF(VT);
4620
4621 int CondCode = CD->getSExtValue();
4622 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
4623 CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
4624 return DAG.getUNDEF(VT);
4625
4626 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
4627 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
4628 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
4629 Op.getOperand(2), DAG.getCondCode(CCOpcode));
4630 }
4631 case Intrinsic::amdgcn_fcmp: {
4632 const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
4633 if (!CD)
4634 return DAG.getUNDEF(VT);
4635
4636 int CondCode = CD->getSExtValue();
4637 if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
4638 CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
4639 return DAG.getUNDEF(VT);
4640
4641 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
4642 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
4643 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
4644 Op.getOperand(2), DAG.getCondCode(CCOpcode));
4645 }
4646 case Intrinsic::amdgcn_fmed3:
4647 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
4648 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4649 case Intrinsic::amdgcn_fmul_legacy:
4650 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
4651 Op.getOperand(1), Op.getOperand(2));
4652 case Intrinsic::amdgcn_sffbh:
4653 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
4654 case Intrinsic::amdgcn_sbfe:
4655 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
4656 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4657 case Intrinsic::amdgcn_ubfe:
4658 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
4659 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4660 case Intrinsic::amdgcn_cvt_pkrtz:
4661 case Intrinsic::amdgcn_cvt_pknorm_i16:
4662 case Intrinsic::amdgcn_cvt_pknorm_u16:
4663 case Intrinsic::amdgcn_cvt_pk_i16:
4664 case Intrinsic::amdgcn_cvt_pk_u16: {
4665 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
4666 EVT VT = Op.getValueType();
4667 unsigned Opcode;
4668
4669 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
4670 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
4671 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
4672 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
4673 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
4674 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
4675 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
4676 Opcode = AMDGPUISD::CVT_PK_I16_I32;
4677 else
4678 Opcode = AMDGPUISD::CVT_PK_U16_U32;
4679
4680 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
4681 Op.getOperand(1), Op.getOperand(2));
4682 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
4683 }
4684 case Intrinsic::amdgcn_wqm: {
4685 SDValue Src = Op.getOperand(1);
4686 return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
4687 0);
4688 }
4689 case Intrinsic::amdgcn_wwm: {
4690 SDValue Src = Op.getOperand(1);
4691 return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
4692 0);
4693 }
4694 case Intrinsic::amdgcn_image_getlod:
4695 case Intrinsic::amdgcn_image_getresinfo: {
4696 unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4;
4697
4698 // Replace dmask with everything disabled with undef.
4699 const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx));
4700 if (!DMask || DMask->isNullValue())
4701 return DAG.getUNDEF(Op.getValueType());
4702 return SDValue();
4703 }
4704 default:
4705 return Op;
4706 }
4707}
4708
4709SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
4710 SelectionDAG &DAG) const {
4711 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4712 SDLoc DL(Op);
4713
4714 switch (IntrID) {
4715 case Intrinsic::amdgcn_atomic_inc:
4716 case Intrinsic::amdgcn_atomic_dec:
4717 case Intrinsic::amdgcn_ds_fadd:
4718 case Intrinsic::amdgcn_ds_fmin:
4719 case Intrinsic::amdgcn_ds_fmax: {
4720 MemSDNode *M = cast<MemSDNode>(Op);
4721 unsigned Opc;
4722 switch (IntrID) {
4723 case Intrinsic::amdgcn_atomic_inc:
4724 Opc = AMDGPUISD::ATOMIC_INC;
4725 break;
4726 case Intrinsic::amdgcn_atomic_dec:
4727 Opc = AMDGPUISD::ATOMIC_DEC;
4728 break;
4729 case Intrinsic::amdgcn_ds_fadd:
4730 Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
4731 break;
4732 case Intrinsic::amdgcn_ds_fmin:
4733 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
4734 break;
4735 case Intrinsic::amdgcn_ds_fmax:
4736 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
4737 break;
4738 default:
4739 llvm_unreachable("Unknown intrinsic!")::llvm::llvm_unreachable_internal("Unknown intrinsic!", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4739)
;
4740 }
4741 SDValue Ops[] = {
4742 M->getOperand(0), // Chain
4743 M->getOperand(2), // Ptr
4744 M->getOperand(3) // Value
4745 };
4746
4747 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
4748 M->getMemoryVT(), M->getMemOperand());
4749 }
4750 case Intrinsic::amdgcn_buffer_load:
4751 case Intrinsic::amdgcn_buffer_load_format: {
4752 SDValue Ops[] = {
4753 Op.getOperand(0), // Chain
4754 Op.getOperand(2), // rsrc
4755 Op.getOperand(3), // vindex
4756 Op.getOperand(4), // offset
4757 Op.getOperand(5), // glc
4758 Op.getOperand(6) // slc
4759 };
4760
4761 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
4762 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
4763 EVT VT = Op.getValueType();
4764 EVT IntVT = VT.changeTypeToInteger();
4765
4766 auto *M = cast<MemSDNode>(Op);
4767 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
4768 M->getMemOperand());
4769 }
4770 case Intrinsic::amdgcn_tbuffer_load: {
4771 MemSDNode *M = cast<MemSDNode>(Op);
4772 SDValue Ops[] = {
4773 Op.getOperand(0), // Chain
4774 Op.getOperand(2), // rsrc
4775 Op.getOperand(3), // vindex
4776 Op.getOperand(4), // voffset
4777 Op.getOperand(5), // soffset
4778 Op.getOperand(6), // offset
4779 Op.getOperand(7), // dfmt
4780 Op.getOperand(8), // nfmt
4781 Op.getOperand(9), // glc
4782 Op.getOperand(10) // slc
4783 };
4784
4785 EVT VT = Op.getValueType();
4786
4787 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
4788 Op->getVTList(), Ops, VT, M->getMemOperand());
4789 }
4790 case Intrinsic::amdgcn_buffer_atomic_swap:
4791 case Intrinsic::amdgcn_buffer_atomic_add:
4792 case Intrinsic::amdgcn_buffer_atomic_sub:
4793 case Intrinsic::amdgcn_buffer_atomic_smin:
4794 case Intrinsic::amdgcn_buffer_atomic_umin:
4795 case Intrinsic::amdgcn_buffer_atomic_smax:
4796 case Intrinsic::amdgcn_buffer_atomic_umax:
4797 case Intrinsic::amdgcn_buffer_atomic_and:
4798 case Intrinsic::amdgcn_buffer_atomic_or:
4799 case Intrinsic::amdgcn_buffer_atomic_xor: {
4800 SDValue Ops[] = {
4801 Op.getOperand(0), // Chain
4802 Op.getOperand(2), // vdata
4803 Op.getOperand(3), // rsrc
4804 Op.getOperand(4), // vindex
4805 Op.getOperand(5), // offset
4806 Op.getOperand(6) // slc
4807 };
4808 EVT VT = Op.getValueType();
4809
4810 auto *M = cast<MemSDNode>(Op);
4811 unsigned Opcode = 0;
4812
4813 switch (IntrID) {
4814 case Intrinsic::amdgcn_buffer_atomic_swap:
4815 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
4816 break;
4817 case Intrinsic::amdgcn_buffer_atomic_add:
4818 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
4819 break;
4820 case Intrinsic::amdgcn_buffer_atomic_sub:
4821 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
4822 break;
4823 case Intrinsic::amdgcn_buffer_atomic_smin:
4824 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
4825 break;
4826 case Intrinsic::amdgcn_buffer_atomic_umin:
4827 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
4828 break;
4829 case Intrinsic::amdgcn_buffer_atomic_smax:
4830 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
4831 break;
4832 case Intrinsic::amdgcn_buffer_atomic_umax:
4833 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
4834 break;
4835 case Intrinsic::amdgcn_buffer_atomic_and:
4836 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
4837 break;
4838 case Intrinsic::amdgcn_buffer_atomic_or:
4839 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
4840 break;
4841 case Intrinsic::amdgcn_buffer_atomic_xor:
4842 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
4843 break;
4844 default:
4845 llvm_unreachable("unhandled atomic opcode")::llvm::llvm_unreachable_internal("unhandled atomic opcode", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4845)
;
4846 }
4847
4848 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
4849 M->getMemOperand());
4850 }
4851
4852 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
4853 SDValue Ops[] = {
4854 Op.getOperand(0), // Chain
4855 Op.getOperand(2), // src
4856 Op.getOperand(3), // cmp
4857 Op.getOperand(4), // rsrc
4858 Op.getOperand(5), // vindex
4859 Op.getOperand(6), // offset
4860 Op.getOperand(7) // slc
4861 };
4862 EVT VT = Op.getValueType();
4863 auto *M = cast<MemSDNode>(Op);
4864
4865 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
4866 Op->getVTList(), Ops, VT, M->getMemOperand());
4867 }
4868
4869 // Basic sample.
4870 case Intrinsic::amdgcn_image_sample:
4871 case Intrinsic::amdgcn_image_sample_cl:
4872 case Intrinsic::amdgcn_image_sample_d:
4873 case Intrinsic::amdgcn_image_sample_d_cl:
4874 case Intrinsic::amdgcn_image_sample_l:
4875 case Intrinsic::amdgcn_image_sample_b:
4876 case Intrinsic::amdgcn_image_sample_b_cl:
4877 case Intrinsic::amdgcn_image_sample_lz:
4878 case Intrinsic::amdgcn_image_sample_cd:
4879 case Intrinsic::amdgcn_image_sample_cd_cl:
4880
4881 // Sample with comparison.
4882 case Intrinsic::amdgcn_image_sample_c:
4883 case Intrinsic::amdgcn_image_sample_c_cl:
4884 case Intrinsic::amdgcn_image_sample_c_d:
4885 case Intrinsic::amdgcn_image_sample_c_d_cl:
4886 case Intrinsic::amdgcn_image_sample_c_l:
4887 case Intrinsic::amdgcn_image_sample_c_b:
4888 case Intrinsic::amdgcn_image_sample_c_b_cl:
4889 case Intrinsic::amdgcn_image_sample_c_lz:
4890 case Intrinsic::amdgcn_image_sample_c_cd:
4891 case Intrinsic::amdgcn_image_sample_c_cd_cl:
4892
4893 // Sample with offsets.
4894 case Intrinsic::amdgcn_image_sample_o:
4895 case Intrinsic::amdgcn_image_sample_cl_o:
4896 case Intrinsic::amdgcn_image_sample_d_o:
4897 case Intrinsic::amdgcn_image_sample_d_cl_o:
4898 case Intrinsic::amdgcn_image_sample_l_o:
4899 case Intrinsic::amdgcn_image_sample_b_o:
4900 case Intrinsic::amdgcn_image_sample_b_cl_o:
4901 case Intrinsic::amdgcn_image_sample_lz_o:
4902 case Intrinsic::amdgcn_image_sample_cd_o:
4903 case Intrinsic::amdgcn_image_sample_cd_cl_o:
4904
4905 // Sample with comparison and offsets.
4906 case Intrinsic::amdgcn_image_sample_c_o:
4907 case Intrinsic::amdgcn_image_sample_c_cl_o:
4908 case Intrinsic::amdgcn_image_sample_c_d_o:
4909 case Intrinsic::amdgcn_image_sample_c_d_cl_o:
4910 case Intrinsic::amdgcn_image_sample_c_l_o:
4911 case Intrinsic::amdgcn_image_sample_c_b_o:
4912 case Intrinsic::amdgcn_image_sample_c_b_cl_o:
4913 case Intrinsic::amdgcn_image_sample_c_lz_o:
4914 case Intrinsic::amdgcn_image_sample_c_cd_o:
4915 case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
4916 // Replace dmask with everything disabled with undef.
4917 const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
4918 if (!DMask || DMask->isNullValue()) {
4919 SDValue Undef = DAG.getUNDEF(Op.getValueType());
4920 return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
4921 }
4922
4923 return SDValue();
4924 }
4925 default:
4926 return SDValue();
4927 }
4928}
4929
4930SDValue SITargetLowering::handleD16VData(SDValue VData,
4931 SelectionDAG &DAG) const {
4932 EVT StoreVT = VData.getValueType();
4933 SDLoc DL(VData);
4934
4935 if (StoreVT.isVector()) {
4936 assert ((StoreVT.getVectorNumElements() != 3) && "Handle v3f16")(static_cast <bool> ((StoreVT.getVectorNumElements() !=
3) && "Handle v3f16") ? void (0) : __assert_fail ("(StoreVT.getVectorNumElements() != 3) && \"Handle v3f16\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4936, __extension__ __PRETTY_FUNCTION__))
;
4937 if (!Subtarget->hasUnpackedD16VMem()) {
4938 if (!isTypeLegal(StoreVT)) {
4939 // If Target supports packed vmem, we just need to workaround
4940 // the illegal type by casting to an equivalent one.
4941 EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT);
4942 return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData);
4943 }
4944 } else { // We need to unpack the packed data to store.
4945 EVT IntStoreVT = StoreVT.changeTypeToInteger();
4946 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
4947 EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
4948 return DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
4949 }
4950 }
4951 // No change for f16 and legal vector D16 types.
4952 return VData;
4953}
4954
4955SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
4956 SelectionDAG &DAG) const {
4957 SDLoc DL(Op);
4958 SDValue Chain = Op.getOperand(0);
4959 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4960 MachineFunction &MF = DAG.getMachineFunction();
4961
4962 switch (IntrinsicID) {
4963 case Intrinsic::amdgcn_exp: {
4964 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
4965 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
4966 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
4967 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
4968
4969 const SDValue Ops[] = {
4970 Chain,
4971 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
4972 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
4973 Op.getOperand(4), // src0
4974 Op.getOperand(5), // src1
4975 Op.getOperand(6), // src2
4976 Op.getOperand(7), // src3
4977 DAG.getTargetConstant(0, DL, MVT::i1), // compr
4978 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
4979 };
4980
4981 unsigned Opc = Done->isNullValue() ?
4982 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
4983 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
4984 }
4985 case Intrinsic::amdgcn_exp_compr: {
4986 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
4987 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
4988 SDValue Src0 = Op.getOperand(4);
4989 SDValue Src1 = Op.getOperand(5);
4990 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
4991 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
4992
4993 SDValue Undef = DAG.getUNDEF(MVT::f32);
4994 const SDValue Ops[] = {
4995 Chain,
4996 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
4997 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
4998 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
4999 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
5000 Undef, // src2
5001 Undef, // src3
5002 DAG.getTargetConstant(1, DL, MVT::i1), // compr
5003 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
5004 };
5005
5006 unsigned Opc = Done->isNullValue() ?
5007 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
5008 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
5009 }
5010 case Intrinsic::amdgcn_s_sendmsg:
5011 case Intrinsic::amdgcn_s_sendmsghalt: {
5012 unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
5013 AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
5014 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
5015 SDValue Glue = Chain.getValue(1);
5016 return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
5017 Op.getOperand(2), Glue);
5018 }
5019 case Intrinsic::amdgcn_init_exec: {
5020 return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
5021 Op.getOperand(2));
5022 }
5023 case Intrinsic::amdgcn_init_exec_from_input: {
5024 return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
5025 Op.getOperand(2), Op.getOperand(3));
5026 }
5027 case AMDGPUIntrinsic::AMDGPU_kill: {
5028 SDValue Src = Op.getOperand(2);
5029 if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
5030 if (!K->isNegative())
5031 return Chain;
5032
5033 SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
5034 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
5035 }
5036
5037 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
5038 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
5039 }
5040 case Intrinsic::amdgcn_s_barrier: {
5041 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
5042 const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
5043 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
5044 if (WGSize <= ST.getWavefrontSize())
5045 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
5046 Op.getOperand(0)), 0);
5047 }
5048 return SDValue();
5049 };
5050 case AMDGPUIntrinsic::SI_tbuffer_store: {
5051
5052 // Extract vindex and voffset from vaddr as appropriate
5053 const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
5054 const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
5055 SDValue VAddr = Op.getOperand(5);
5056
5057 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
5058
5059 assert(!(OffEn->isOne() && IdxEn->isOne()) &&(static_cast <bool> (!(OffEn->isOne() && IdxEn
->isOne()) && "Legacy intrinsic doesn't support both offset and index - use new version"
) ? void (0) : __assert_fail ("!(OffEn->isOne() && IdxEn->isOne()) && \"Legacy intrinsic doesn't support both offset and index - use new version\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5060, __extension__ __PRETTY_FUNCTION__))
5060 "Legacy intrinsic doesn't support both offset and index - use new version")(static_cast <bool> (!(OffEn->isOne() && IdxEn
->isOne()) && "Legacy intrinsic doesn't support both offset and index - use new version"
) ? void (0) : __assert_fail ("!(OffEn->isOne() && IdxEn->isOne()) && \"Legacy intrinsic doesn't support both offset and index - use new version\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5060, __extension__ __PRETTY_FUNCTION__))
;
5061
5062 SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
5063 SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
5064
5065 // Deal with the vec-3 case
5066 const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
5067 auto Opcode = NumChannels->getZExtValue() == 3 ?
5068 AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
5069
5070 SDValue Ops[] = {
5071 Chain,
5072 Op.getOperand(3), // vdata
5073 Op.getOperand(2), // rsrc
5074 VIndex,
5075 VOffset,
5076 Op.getOperand(6), // soffset
5077 Op.getOperand(7), // inst_offset
5078 Op.getOperand(8), // dfmt
5079 Op.getOperand(9), // nfmt
5080 Op.getOperand(12), // glc
5081 Op.getOperand(13), // slc
5082 };
5083
5084 assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&(static_cast <bool> ((cast<ConstantSDNode>(Op.getOperand
(14)))->getZExtValue() == 0 && "Value of tfe other than zero is unsupported"
) ? void (0) : __assert_fail ("(cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 && \"Value of tfe other than zero is unsupported\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5085, __extension__ __PRETTY_FUNCTION__))
5085 "Value of tfe other than zero is unsupported")(static_cast <bool> ((cast<ConstantSDNode>(Op.getOperand
(14)))->getZExtValue() == 0 && "Value of tfe other than zero is unsupported"
) ? void (0) : __assert_fail ("(cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 && \"Value of tfe other than zero is unsupported\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5085, __extension__ __PRETTY_FUNCTION__))
;
5086
5087 EVT VT = Op.getOperand(3).getValueType();
5088 MachineMemOperand *MMO = MF.getMachineMemOperand(
5089 MachinePointerInfo(),
5090 MachineMemOperand::MOStore,
5091 VT.getStoreSize(), 4);
5092 return DAG.getMemIntrinsicNode(Opcode, DL,
5093 Op->getVTList(), Ops, VT, MMO);
5094 }
5095
5096 case Intrinsic::amdgcn_tbuffer_store: {
5097 SDValue VData = Op.getOperand(2);
5098 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5099 if (IsD16)
5100 VData = handleD16VData(VData, DAG);
5101 SDValue Ops[] = {
5102 Chain,
5103 VData, // vdata
5104 Op.getOperand(3), // rsrc
5105 Op.getOperand(4), // vindex
5106 Op.getOperand(5), // voffset
5107 Op.getOperand(6), // soffset
5108 Op.getOperand(7), // offset
5109 Op.getOperand(8), // dfmt
5110 Op.getOperand(9), // nfmt
5111 Op.getOperand(10), // glc
5112 Op.getOperand(11) // slc
5113 };
5114 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5115 AMDGPUISD::TBUFFER_STORE_FORMAT;
5116 MemSDNode *M = cast<MemSDNode>(Op);
5117 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5118 M->getMemoryVT(), M->getMemOperand());
5119 }
5120
5121 case Intrinsic::amdgcn_buffer_store:
5122 case Intrinsic::amdgcn_buffer_store_format: {
5123 SDValue VData = Op.getOperand(2);
5124 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5125 if (IsD16)
5126 VData = handleD16VData(VData, DAG);
5127 SDValue Ops[] = {
5128 Chain,
5129 VData, // vdata
5130 Op.getOperand(3), // rsrc
5131 Op.getOperand(4), // vindex
5132 Op.getOperand(5), // offset
5133 Op.getOperand(6), // glc
5134 Op.getOperand(7) // slc
5135 };
5136 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
5137 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
5138 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
5139 MemSDNode *M = cast<MemSDNode>(Op);
5140 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5141 M->getMemoryVT(), M->getMemOperand());
5142 }
5143
5144 case Intrinsic::amdgcn_image_store:
5145 case Intrinsic::amdgcn_image_store_mip: {
5146 SDValue VData = Op.getOperand(2);
5147 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5148 if (IsD16)
5149 VData = handleD16VData(VData, DAG);
5150 SDValue Ops[] = {
5151 Chain, // Chain
5152 VData, // vdata
5153 Op.getOperand(3), // vaddr
5154 Op.getOperand(4), // rsrc
5155 Op.getOperand(5), // dmask
5156 Op.getOperand(6), // glc
5157 Op.getOperand(7), // slc
5158 Op.getOperand(8), // lwe
5159 Op.getOperand(9) // da
5160 };
5161 unsigned Opc = (IntrinsicID==Intrinsic::amdgcn_image_store) ?
5162 AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP;
5163 MemSDNode *M = cast<MemSDNode>(Op);
5164 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5165 M->getMemoryVT(), M->getMemOperand());
5166 }
5167
5168 default: {
5169 const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
5170 AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrinsicID);
5171 if (D16ImageDimIntr) {
5172 SDValue VData = Op.getOperand(2);
5173 EVT StoreVT = VData.getValueType();
5174 if ((StoreVT == MVT::v2f16 && !isTypeLegal(StoreVT)) ||
5175 StoreVT == MVT::v4f16) {
5176 VData = handleD16VData(VData, DAG);
5177
5178 SmallVector<SDValue, 12> Ops;
5179 for (auto Value : Op.getNode()->op_values())
5180 Ops.push_back(Value);
5181 Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
5182 Ops[2] = VData;
5183
5184 MemSDNode *M = cast<MemSDNode>(Op);
5185 return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Op->getVTList(),
5186 Ops, M->getMemoryVT(),
5187 M->getMemOperand());
5188 }
5189 }
5190
5191 return Op;
5192 }
5193 }
5194}
5195
5196SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
5197 SDLoc DL(Op);
5198 LoadSDNode *Load = cast<LoadSDNode>(Op);
5199 ISD::LoadExtType ExtType = Load->getExtensionType();
5200 EVT MemVT = Load->getMemoryVT();
5201
5202 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
5203 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
5204 return SDValue();
5205
5206 // FIXME: Copied from PPC
5207 // First, load into 32 bits, then truncate to 1 bit.
5208
5209 SDValue Chain = Load->getChain();
5210 SDValue BasePtr = Load->getBasePtr();
5211 MachineMemOperand *MMO = Load->getMemOperand();
5212
5213 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
5214
5215 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
5216 BasePtr, RealMemVT, MMO);
5217
5218 SDValue Ops[] = {
5219 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
5220 NewLD.getValue(1)
5221 };
5222
5223 return DAG.getMergeValues(Ops, DL);
5224 }
5225
5226 if (!MemVT.isVector())
5227 return SDValue();
5228
5229 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&(static_cast <bool> (Op.getValueType().getVectorElementType
() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."
) ? void (0) : __assert_fail ("Op.getValueType().getVectorElementType() == MVT::i32 && \"Custom lowering for non-i32 vectors hasn't been implemented.\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5230, __extension__ __PRETTY_FUNCTION__))
5230 "Custom lowering for non-i32 vectors hasn't been implemented.")(static_cast <bool> (Op.getValueType().getVectorElementType
() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."
) ? void (0) : __assert_fail ("Op.getValueType().getVectorElementType() == MVT::i32 && \"Custom lowering for non-i32 vectors hasn't been implemented.\""
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5230, __extension__ __PRETTY_FUNCTION__))
;
5231
5232 unsigned Alignment = Load->getAlignment();
5233 unsigned AS = Load->getAddressSpace();
5234 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
5235 AS, Alignment)) {
5236 SDValue Ops[2];
5237 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
5238 return DAG.getMergeValues(Ops, DL);
5239 }
5240
5241 MachineFunction &MF = DAG.getMachineFunction();
5242 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
5243 // If there is a possibilty that flat instruction access scratch memory
5244 // then we need to use the same legalization rules we use for private.
5245 if (AS == AMDGPUASI.FLAT_ADDRESS)
5246 AS = MFI->hasFlatScratchInit() ?
5247 AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
5248
5249 unsigned NumElements = MemVT.getVectorNumElements();
5250
5251 if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
5252 AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
5253 if (!Op->isDivergent() && Alignment >= 4)
5254 return SDValue();
5255 // Non-uniform loads will be selected to MUBUF instructions, so they
5256 // have the same legalization requirements as global and private
5257 // loads.
5258 //
5259 }
5260
5261 if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
5262 AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
5263 AS == AMDGPUASI.GLOBAL_ADDRESS) {
5264 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
5265 !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
5266 Alignment >= 4)
5267 return SDValue();
5268 // Non-uniform loads will be selected to MUBUF instructions, so they
5269 // have the same legalization requirements as global and private
5270 // loads.
5271 //
5272 }
5273 if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
5274 AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
5275 AS == AMDGPUASI.GLOBAL_ADDRESS ||
5276 AS == AMDGPUASI.FLAT_ADDRESS) {
5277 if (NumElements > 4)
5278 return SplitVectorLoad(Op, DAG);
5279 // v4 loads are supported for private and global memory.
5280 return SDValue();
5281 }
5282 if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
5283 // Depending on the setting of the private_element_size field in the
5284 // resource descriptor, we can only make private accesses up to a certain
5285 // size.
5286 switch (Subtarget->getMaxPrivateElementSize()) {
5287 case 4:
5288 return scalarizeVectorLoad(Load, DAG);
5289 case 8:
5290 if (NumElements > 2)
5291 return SplitVectorLoad(Op, DAG);
5292 return SDValue();
5293 case 16:
5294 // Same as global/flat
5295 if (NumElements > 4)
5296 return SplitVectorLoad(Op, DAG);
5297 return SDValue();
5298 default:
5299 llvm_unreachable("unsupported private_element_size")::llvm::llvm_unreachable_internal("unsupported private_element_size"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5299)
;
5300 }
5301 } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
5302 // Use ds_read_b128 if possible.
5303 if (Subtarget->useDS128(EnableDS128) && Load->getAlignment() >= 16 &&
5304 MemVT.getStoreSize() == 16)
5305 return SDValue();
5306
5307 if (NumElements > 2)
5308 return SplitVectorLoad(Op, DAG);
5309 }
5310 return SDValue();
5311}
5312
5313SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
5314 if (Op.getValueType() != MVT::i64)
5315 return SDValue();
5316
5317 SDLoc DL(Op);
5318 SDValue Cond = Op.getOperand(0);
5319
5320 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
5321 SDValue One = DAG.getConstant(1, DL, MVT::i32);
5322
5323 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
5324 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
5325
5326 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
5327 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
5328
5329 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
5330
5331 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
5332 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
5333
5334 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
5335
5336 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
5337 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
5338}
5339
5340// Catch division cases where we can use shortcuts with rcp and rsq
5341// instructions.
5342SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
5343 SelectionDAG &DAG) const {
5344 SDLoc SL(Op);
5345 SDValue LHS = Op.getOperand(0);
5346 SDValue RHS = Op.getOperand(1);
5347 EVT VT = Op.getValueType();
5348 const SDNodeFlags Flags = Op->getFlags();
5349 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
5350 Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
5351
5352 if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
5353 return SDValue();
5354
5355 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
5356 if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
5357 if (CLHS->isExactlyValue(1.0)) {
5358 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5359 // the CI documentation has a worst case error of 1 ulp.
5360 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5361 // use it as long as we aren't trying to use denormals.
5362 //
5363 // v_rcp_f16 and v_rsq_f16 DO support denormals.
5364
5365 // 1.0 / sqrt(x) -> rsq(x)
5366
5367 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
5368 // error seems really high at 2^29 ULP.
5369 if (RHS.getOpcode() == ISD::FSQRT)
5370 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
5371
5372 // 1.0 / x -> rcp(x)
5373 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
5374 }
5375
5376 // Same as for 1.0, but expand the sign out of the constant.
5377 if (CLHS->isExactlyValue(-1.0)) {
5378 // -1.0 / x -> rcp (fneg x)
5379 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5380 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
5381 }
5382 }
5383 }
5384
5385 if (Unsafe) {
5386 // Turn into multiply by the reciprocal.
5387 // x / y -> x * (1.0 / y)
5388 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
5389 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
5390 }
5391
5392 return SDValue();
5393}
5394
5395static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
5396 EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
5397 if (GlueChain->getNumValues() <= 1) {
5398 return DAG.getNode(Opcode, SL, VT, A, B);
5399 }
5400
5401 assert(GlueChain->getNumValues() == 3)(static_cast <bool> (GlueChain->getNumValues() == 3)
? void (0) : __assert_fail ("GlueChain->getNumValues() == 3"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5401, __extension__ __PRETTY_FUNCTION__))
;
5402
5403 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
5404 switch (Opcode) {
5405 default: llvm_unreachable("no chain equivalent for opcode")::llvm::llvm_unreachable_internal("no chain equivalent for opcode"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5405)
;
5406 case ISD::FMUL:
5407 Opcode = AMDGPUISD::FMUL_W_CHAIN;
5408 break;
5409 }
5410
5411 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
5412 GlueChain.getValue(2));
5413}
5414
5415static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
5416 EVT VT, SDValue A, SDValue B, SDValue C,
5417 SDValue GlueChain) {
5418 if (GlueChain->getNumValues() <= 1) {
5419 return DAG.getNode(Opcode, SL, VT, A, B, C);
5420 }
5421
5422 assert(GlueChain->getNumValues() == 3)(static_cast <bool> (GlueChain->getNumValues() == 3)
? void (0) : __assert_fail ("GlueChain->getNumValues() == 3"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5422, __extension__ __PRETTY_FUNCTION__))
;
5423
5424 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
5425 switch (Opcode) {
5426 default: llvm_unreachable("no chain equivalent for opcode")::llvm::llvm_unreachable_internal("no chain equivalent for opcode"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5426)
;
5427 case ISD::FMA:
5428 Opcode = AMDGPUISD::FMA_W_CHAIN;
5429 break;
5430 }
5431
5432 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
5433 GlueChain.getValue(2));
5434}
5435
5436SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
5437 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
5438 return FastLowered;
5439
5440 SDLoc SL(Op);
5441 SDValue Src0 = Op.getOperand(0);
5442 SDValue Src1 = Op.getOperand(1);
5443
5444 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
5445 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
5446
5447 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
5448 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
5449
5450 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
5451 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
5452
5453 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
5454}
5455
5456// Faster 2.5 ULP division that does not support denormals.
// Builds an f32 division as LHS * rcp(RHS), pre-scaling very large
// denominators and applying the same scale factor to the result.
//
// The numerator and denominator are read from operands 1 and 2 of Op
// (NOTE(review): operand 0 is presumably an intrinsic ID -- confirm at the
// call site). All nodes are created with type f32.
5457SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
5458 SDLoc SL(Op);
5459 SDValue LHS = Op.getOperand(1);
5460 SDValue RHS = Op.getOperand(2);
5461
5462 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
5463
 // 0x6f800000 is the IEEE-754 single-precision bit pattern of 2^96.
5464 const APFloat K0Val(BitsToFloat(0x6f800000));
5465 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
5466
 // 0x2f800000 is the IEEE-754 single-precision bit pattern of 2^-32.
5467 const APFloat K1Val(BitsToFloat(0x2f800000));
5468 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
5469
5470 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
5471
5472 EVT SetCCVT =
5473 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
5474
 // r2 = |RHS| > 2^96 (ordered compare).
5475 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
5476
 // r3 is the scale factor: 2^-32 for a huge denominator, otherwise 1.0.
5477 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
5478
5479 // TODO: Should this propagate fast-math-flags?
 // r1 is reused here: it now holds the scaled denominator RHS * r3.
5480 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
5481
5482 // rcp does not support denormals.
5483 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
5484
5485 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
5486
 // Multiply by r3 again so the scaling applied to the denominator cancels.
5487 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
5488}
5489
// Lowers an f32 FDIV (operands 0 and 1 of Op).
//
// The fast/unsafe lowering is tried first. Otherwise the division is built
// from DIV_SCALE, RCP, a sequence of FMA/FMUL refinement steps, DIV_FMAS and
// DIV_FIXUP. When the subtarget does not have FP32 denormals enabled, the
// refinement steps are bracketed by two SETREG nodes that change the denormal
// mode, and chain/glue is threaded through the steps via getFPTernOp /
// getFPBinOp so they stay between those two SETREGs.
5490SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
5491 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
5492 return FastLowered;
5493
5494 SDLoc SL(Op);
5495 SDValue LHS = Op.getOperand(0);
5496 SDValue RHS = Op.getOperand(1);
5497
5498 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
5499
 // DIV_SCALE produces two results: the scaled f32 value and an i1 flag.
5500 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
5501
5502 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
5503 RHS, RHS, LHS);
5504 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
5505 LHS, RHS, LHS);
5506
5507 // Denominator is scaled to not be denormal, so using rcp is ok.
5508 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
5509 DenominatorScaled);
5510 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
5511 DenominatorScaled);
5512
 // Hardware-register selector for SETREG: the MODE register with offset 4
 // and a width-minus-one field of 1.
 // NOTE(review): presumably this selects the two-bit FP32 denormal-mode
 // field -- confirm against the Hwreg encoding.
5513 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
5514 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
5515 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
5516
5517 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
5518
5519 if (!Subtarget->hasFP32Denormals()) {
5520 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
5521 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE3,
5522 SL, MVT::i32);
5523 SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
5524 DAG.getEntryNode(),
5525 EnableDenormValue, BitField);
 // Bundle the value with the SETREG's chain and glue so that it has three
 // results; getFPTernOp then switches to the chained node form.
5526 SDValue Ops[3] = {
5527 NegDivScale0,
5528 EnableDenorm.getValue(0),
5529 EnableDenorm.getValue(1)
5530 };
5531
5532 NegDivScale0 = DAG.getMergeValues(Ops, SL);
5533 }
5534
 // In each call below the last argument is the node whose chain/glue results
 // (if any) are threaded into the new node.
5535 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
5536 ApproxRcp, One, NegDivScale0);
5537
5538 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
5539 ApproxRcp, Fma0);
5540
5541 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
5542 Fma1, Fma1);
5543
5544 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
5545 NumeratorScaled, Mul);
5546
5547 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
5548
5549 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
5550 NumeratorScaled, Fma3);
5551
5552 if (!Subtarget->hasFP32Denormals()) {
 // Second SETREG, chained and glued after the last refinement step.
5553 const SDValue DisableDenormValue =
5554 DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT0, SL, MVT::i32);
5555 SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
5556 Fma4.getValue(1),
5557 DisableDenormValue,
5558 BitField,
5559 Fma4.getValue(2));
5560
 // Join the SETREG chain with the current DAG root and install the result
 // as the new root.
5561 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
5562 DisableDenorm, DAG.getRoot());
5563 DAG.setRoot(OutputChain);
5564 }
5565
 // Second (i1) result of the numerator's DIV_SCALE.
5566 SDValue Scale = NumeratorScaled.getValue(1);
5567 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
5568 Fma4, Fma1, Fma3, Scale);
5569
5570 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
5571}
5572
// Lowers an f64 FDIV (X = operand 0, Y = operand 1 of Op).
//
// With UnsafeFPMath enabled the result of lowerFastUnsafeFDIV is returned
// directly. Otherwise the division is built from DIV_SCALE, RCP, a sequence
// of FMA/FMUL refinement steps, DIV_FMAS and DIV_FIXUP. On SOUTHERN_ISLANDS
// the i1 scale flag is recomputed manually instead of being taken from
// DIV_SCALE.
5573SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
5574 if (DAG.getTarget().Options.UnsafeFPMath)
5575 return lowerFastUnsafeFDIV(Op, DAG);
5576
5577 SDLoc SL(Op);
5578 SDValue X = Op.getOperand(0);
5579 SDValue Y = Op.getOperand(1);
5580
5581 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
5582
 // DIV_SCALE produces two results: the scaled f64 value and an i1 flag.
5583 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
5584
 // DivScale0 is the DIV_SCALE of the denominator.
5585 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
5586
5587 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
5588
5589 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
5590
5591 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
5592
5593 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
5594
5595 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
5596
 // DivScale1 is the DIV_SCALE of the numerator.
5597 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
5598
5599 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
5600 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
5601
5602 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
5603 NegDivScale0, Mul, DivScale1);
5604
5605 SDValue Scale;
5606
5607 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
5608 // Workaround a hardware bug on SI where the condition output from div_scale
5609 // is not usable.
5610
 // Index 1 of the v2i32 bitcasts below.
 // NOTE(review): presumably the high 32 bits of each f64 -- confirm the
 // element order.
5611 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
5612
5613 // Figure out if the scale to use for div_fmas.
5614 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
5615 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
5616 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
5617 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
5618
5619 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
5620 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
5621
5622 SDValue Scale0Hi
5623 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
5624 SDValue Scale1Hi
5625 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
5626
 // Compare each operand's extracted word with that of its DIV_SCALE result;
 // the flag is the XOR of the two equality results.
5627 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
5628 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
5629 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
5630 } else {
 // Use the i1 result of the numerator's DIV_SCALE directly.
5631 Scale = DivScale1.getValue(1);
5632 }
5633
5634 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
5635 Fma4, Fma3, Mul, Scale);
5636
5637 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
5638}
5639
5640SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
5641 EVT VT = Op.getValueType();
5642
5643 if (VT == MVT::f32)
5644 return LowerFDIV32(Op, DAG);
5645
5646 if (VT == MVT::f64)
5647 return LowerFDIV64(Op, DAG);
5648
5649 if (VT == MVT::f16)
5650 return LowerFDIV16(Op, DAG);
5651
5652 llvm_unreachable("Unexpected type for fdiv")::llvm::llvm_unreachable_internal("Unexpected type for fdiv",
"/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5652)
;
5653}
5654
// Custom lowering for store nodes.
//
// i1 stores become truncating stores of an i32 value. All other stores must be
// vector stores whose value has i32 scalar elements; depending on alignment
// and address space they are expanded, scalarized, split, or left untouched.
// A null SDValue is returned where the store is deliberately left as is.
5655SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
5656 SDLoc DL(Op);
5657 StoreSDNode *Store = cast<StoreSDNode>(Op);
5658 EVT VT = Store->getMemoryVT();
5659
5660 if (VT == MVT::i1) {
5661 return DAG.getTruncStore(Store->getChain(), DL,
5662 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
5663 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
5664 }
5665
5666 assert(VT.isVector() &&(static_cast <bool> (VT.isVector() && Store->
getValue().getValueType().getScalarType() == MVT::i32) ? void
(0) : __assert_fail ("VT.isVector() && Store->getValue().getValueType().getScalarType() == MVT::i32"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5667, __extension__ __PRETTY_FUNCTION__))
5667 Store->getValue().getValueType().getScalarType() == MVT::i32)(static_cast <bool> (VT.isVector() && Store->
getValue().getValueType().getScalarType() == MVT::i32) ? void
(0) : __assert_fail ("VT.isVector() && Store->getValue().getValueType().getScalarType() == MVT::i32"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5667, __extension__ __PRETTY_FUNCTION__))
;
5668
 // Accesses the target does not allow at this alignment are expanded.
5669 unsigned AS = Store->getAddressSpace();
5670 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
5671 AS, Store->getAlignment())) {
5672 return expandUnalignedStore(Store, DAG);
5673 }
5674
5675 MachineFunction &MF = DAG.getMachineFunction();
5676 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
5677 // If there is a possibility that a flat instruction accesses scratch memory
5678 // then we need to use the same legalization rules we use for private.
5679 if (AS == AMDGPUASI.FLAT_ADDRESS)
5680 AS = MFI->hasFlatScratchInit() ?
5681 AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
5682
5683 unsigned NumElements = VT.getVectorNumElements();
 // NOTE(review): AS was remapped just above whenever it equalled
 // FLAT_ADDRESS, so the FLAT_ADDRESS comparison below looks unreachable --
 // confirm.
5684 if (AS == AMDGPUASI.GLOBAL_ADDRESS ||
5685 AS == AMDGPUASI.FLAT_ADDRESS) {
5686 if (NumElements > 4)
5687 return SplitVectorStore(Op, DAG);
5688 return SDValue();
5689 } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
 // The case values 4/8/16 pair with element-count limits of 1/2/4 i32
 // elements.
5690 switch (Subtarget->getMaxPrivateElementSize()) {
5691 case 4:
5692 return scalarizeVectorStore(Store, DAG);
5693 case 8:
5694 if (NumElements > 2)
5695 return SplitVectorStore(Op, DAG);
5696 return SDValue();
5697 case 16:
5698 if (NumElements > 4)
5699 return SplitVectorStore(Op, DAG);
5700 return SDValue();
5701 default:
5702 llvm_unreachable("unsupported private_element_size")::llvm::llvm_unreachable_internal("unsupported private_element_size"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5702)
;
5703 }
5704 } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
5705 // Use ds_write_b128 if possible.
5706 if (Subtarget->useDS128(EnableDS128) && Store->getAlignment() >= 16 &&
5707 VT.getStoreSize() == 16)
5708 return SDValue();
5709
5710 if (NumElements > 2)
5711 return SplitVectorStore(Op, DAG);
5712 return SDValue();
5713 } else {
5714 llvm_unreachable("unhandled address space")::llvm::llvm_unreachable_internal("unhandled address space", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5714)
;
5715 }
5716}
5717
5718SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
5719 SDLoc DL(Op);
5720 EVT VT = Op.getValueType();
5721 SDValue Arg = Op.getOperand(0);
5722 // TODO: Should this propagate fast-math-flags?
5723 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
5724 DAG.getNode(ISD::FMUL, DL, VT, Arg,
5725 DAG.getConstantFP(0.5/M_PI3.14159265358979323846, DL,
5726 VT)));
5727
5728 switch (Op.getOpcode()) {
5729 case ISD::FCOS:
5730 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
5731 case ISD::FSIN:
5732 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
5733 default:
5734 llvm_unreachable("Wrong trig opcode")::llvm::llvm_unreachable_internal("Wrong trig opcode", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5734)
;
5735 }
5736}
5737
5738SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
5739 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
5740 assert(AtomicNode->isCompareAndSwap())(static_cast <bool> (AtomicNode->isCompareAndSwap())
? void (0) : __assert_fail ("AtomicNode->isCompareAndSwap()"
, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5740, __extension__ __PRETTY_FUNCTION__))
;
5741 unsigned AS = AtomicNode->getAddressSpace();
5742
5743 // No custom lowering required for local address space
5744 if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
5745 return Op;
5746
5747 // Non-local address space requires custom lowering for atomic compare
5748 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
5749 SDLoc DL(Op);
5750 SDValue ChainIn = Op.getOperand(0);
5751 SDValue Addr = Op.getOperand(1);
5752 SDValue Old = Op.getOperand(2);
5753 SDValue New = Op.getOperand(3);
5754 EVT VT = Op.getValueType();
5755 MVT SimpleVT = VT.getSimpleVT();
5756 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
5757
5758 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
5759 SDValue Ops[] = { ChainIn, Addr, NewOld };
5760
5761 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
5762 Ops, VT, AtomicNode->getMemOperand());
5763}
5764
5765//===----------------------------------------------------------------------===//
5766// Custom DAG optimizations
5767//===----------------------------------------------------------------------===//
5768
5769SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
5770 DAGCombinerInfo &DCI) const {
5771 EVT VT = N->getValueType(0);
5772 EVT ScalarVT = VT.getScalarType();
5773 if (ScalarVT != MVT::f32)
5774 return SDValue();
5775
5776 SelectionDAG &DAG = DCI.DAG;
5777 SDLoc DL(N);
5778
5779 SDValue Src = N->getOperand(0);
5780 EVT SrcVT = Src.getValueType();
5781
5782 // TODO: We could try to match extracting the higher bytes, which would be
5783 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
5784 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
5785 // about in practice.
5786 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
5787 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
5788 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
5789 DCI.AddToWorklist(Cvt.getNode());
5790 return Cvt;
5791 }
5792 }
5793
5794 return SDValue();
5795}
5796
5797// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
5798
5799// This is a variant of
5800// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
5801//
5802// The normal DAG combiner will do this, but only if the add has one use since
5803// that would increase the number of instructions.
5804//
5805// This prevents us from seeing a constant offset that can be folded into a
5806// memory instruction's addressing mode. If we know the resulting add offset of
5807// a pointer can be folded into an addressing offset, we can replace the pointer
5808// operand with the add of new constant offset. This eliminates one of the uses,
5809// and may allow the remaining use to also be simplified.
5810//
5811SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
5812 unsigned AddrSpace,
5813 EVT MemVT,
5814 DAGCombinerInfo &DCI) const {
5815 SDValue N0 = N->getOperand(0);
5816 SDValue N1 = N->getOperand(1);
5817
5818 // We only do this to handle cases where it's profitable when there are
5819 // multiple uses of the add, so defer to the standard combine.
5820 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
5821 N0->hasOneUse())
5822 return SDValue();
5823
5824 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
5825 if (!CN1)
5826 return SDValue();
5827
5828 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5829 if (!CAdd)
5830 return SDValue();
5831
5832 // If the resulting offset is too large, we can't fold it into the addressing
5833 // mode offset.
5834 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
5835 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
5836
5837 AddrMode AM;
5838 AM.HasBaseReg = true;
5839 AM.BaseOffs = Offset.getSExtValue();
5840 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
5841 return SDValue();
5842
5843 SelectionDAG &DAG = DCI.DAG;
5844 SDLoc SL(N);
5845 EVT VT = N->getValueType(0);
5846
5847 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
5848 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
5849
5850 SDNodeFlags Flags;
5851 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
5852 (N0.getOpcode() == ISD::OR ||
5853 N0->getFlags().hasNoUnsignedWrap()));
5854
5855 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
5856}
5857
5858SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
5859 DAGCombinerInfo &DCI) const {
5860 SDValue Ptr = N->getBasePtr();
5861 SelectionDAG &DAG = DCI.DAG;
5862 SDLoc SL(N);
5863
5864 // TODO: We could also do this for multiplies.
5865 if (Ptr.getOpcode() == ISD::SHL) {
5866 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
5867 N->getMemoryVT(), DCI);
5868 if (NewPtr) {
5869 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
5870
5871 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
5872 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
5873 }
5874 }
5875
5876 return SDValue();
5877}
5878
5879static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
5880 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
5881 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
5882 (Opc == ISD::XOR && Val == 0);
5883}
5884
5885// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
5886// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
5887// integer combine opportunities since most 64-bit operations are decomposed
5888// this way. TODO: We won't want this for SALU especially if it is an inline
5889// immediate.
5890SDValue SITargetLowering::splitBinaryBitConstantOp(
5891 DAGCombinerInfo &DCI,
5892 const SDLoc &SL,
5893 unsigned Opc, SDValue LHS,
5894 const ConstantSDNode *CRHS) const {
5895 uint64_t Val = CRHS->getZExtValue();
5896 uint32_t ValLo = Lo_32(Val);
5897 uint32_t ValHi = Hi_32(Val);
5898 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5899
5900 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
5901 bitOpWithConstantIsReducible(Opc, ValHi)) ||
5902 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
5903 // If we need to materialize a 64-bit immediate, it will be split up later
5904 // anyway. Avoid creating the harder to understand 64-bit immediate
5905 // materialization.
5906 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
5907 }
5908
5909 return SDValue();
5910}
5911
5912// Returns true if argument is a boolean value which is not serialized into
5913// memory or argument and does not require v_cmdmask_b32 to be deserialized.
5914static bool isBoolSGPR(SDValue V) {
5915 if (V.getValueType() != MVT::i1)
5916 return false;
5917 switch (V.getOpcode()) {
5918 default: break;
5919 case ISD::SETCC:
5920 case ISD::AND:
5921 case ISD::OR:
5922 case ISD::XOR:
5923 case AMDGPUISD::FP_CLASS:
5924 return true;
5925 }
5926 return false;
5927}
5928
// Target combine for ISD::AND; does nothing before legalization.
//
// Patterns tried, in order:
//  1. i64 and with a constant: split into 32-bit halves.
//  2. i32 (and (srl x, c), mask) with an 8- or 16-bit shifted mask on SDWA
//     subtargets: rewritten as a BFE_U32 followed by a SHL.
//  3. (and (setcc ...), (setcc ...)) forming an "ordered and not infinite"
//     test: rewritten as FP_CLASS.
//  4. i32 (and x, (sext i1)) where the i1 passes isBoolSGPR: rewritten as a
//     select between x and 0.
// Returns a null SDValue if no pattern applied.
5929SDValue SITargetLowering::performAndCombine(SDNode *N,
5930 DAGCombinerInfo &DCI) const {
5931 if (DCI.isBeforeLegalize())
5932 return SDValue();
5933
5934 SelectionDAG &DAG = DCI.DAG;
5935 EVT VT = N->getValueType(0);
5936 SDValue LHS = N->getOperand(0);
5937 SDValue RHS = N->getOperand(1);
5938
5939
5940 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
5941 if (VT == MVT::i64 && CRHS) {
5942 if (SDValue Split
5943 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
5944 return Split;
5945 }
5946
5947 if (CRHS && VT == MVT::i32) {
5948 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
5949 // nb = number of trailing zeroes in mask
5950 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
5951 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
5952 uint64_t Mask = CRHS->getZExtValue();
5953 unsigned Bits = countPopulation(Mask);
 // Require a contiguous run of exactly 8 or 16 set bits that does not
 // include bit 0.
5954 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
5955 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
5956 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
5957 unsigned Shift = CShift->getZExtValue();
5958 unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
 // Bit position of the field within the un-shifted source x.
5959 unsigned Offset = NB + Shift;
5960 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
5961 SDLoc SL(N);
5962 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
5963 LHS->getOperand(0),
5964 DAG.getConstant(Offset, SL, MVT::i32),
5965 DAG.getConstant(Bits, SL, MVT::i32));
 // Record that only the low Bits bits of the extracted field can be set.
5966 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
5967 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
5968 DAG.getValueType(NarrowVT));
5969 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
5970 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
5971 return Shl;
5972 }
5973 }
5974 }
5975 }
5976
5977 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
5978 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
5979 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
5980 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
5981 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
5982
5983 SDValue X = LHS.getOperand(0);
5984 SDValue Y = RHS.getOperand(0);
 // NOTE(review): the early returns in this block leave the function, so the
 // sign-extend pattern further down is never tried for a pair of SETCCs.
5985 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
5986 return SDValue();
5987
5988 if (LCC == ISD::SETO) {
 // The ordered compare must be of x against itself.
5989 if (X != LHS.getOperand(1))
5990 return SDValue();
5991
5992 if (RCC == ISD::SETUNE) {
 // The right-hand compare must be against positive infinity.
5993 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
5994 if (!C1 || !C1->isInfinity() || C1->isNegative())
5995 return SDValue();
5996
5997 const uint32_t Mask = SIInstrFlags::N_NORMAL |
5998 SIInstrFlags::N_SUBNORMAL |
5999 SIInstrFlags::N_ZERO |
6000 SIInstrFlags::P_ZERO |
6001 SIInstrFlags::P_SUBNORMAL |
6002 SIInstrFlags::P_NORMAL;
6003
 // Compile-time check that Mask is exactly the 10-bit complement of the
 // NaN and infinity classes.
6004 static_assert(((~(SIInstrFlags::S_NAN |
6005 SIInstrFlags::Q_NAN |
6006 SIInstrFlags::N_INFINITY |
6007 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
6008 "mask not equal");
6009
6010 SDLoc DL(N);
6011 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
6012 X, DAG.getConstant(Mask, DL, MVT::i32));
6013 }
6014 }
6015 }
6016
6017 if (VT == MVT::i32 &&
6018 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
6019 // and x, (sext cc from i1) => select cc, x, 0
 // Put the sign_extend on the right-hand side.
6020 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
6021 std::swap(LHS, RHS);
6022 if (isBoolSGPR(RHS.getOperand(0)))
6023 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
6024 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
6025 }
6026
6027 return SDValue();
6028}
6029
6030SDValue SITargetLowering::performOrCombine(SDNode *N,
6031 DAGCombinerInfo &DCI) const {
6032 SelectionDAG &DAG = DCI.DAG;
6033 SDValue LHS = N->getOperand(0);
6034 SDValue RHS = N->getOperand(1);
6035
6036 EVT VT = N->getValueType(0);
6037 if (VT == MVT::i1) {
6038 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
6039 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
6040 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
6041 SDValue Src = LHS.getOperand(0);
6042 if (Src != RHS.getOperand(0))
6043 return SDValue();
6044
6045 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
6046 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
6047 if (!CLHS || !CRHS)
6048 return SDValue();
6049
6050 // Only 10 bits are used.
6051 static const uint32_t MaxMask = 0x3ff;
6052
6053 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
6054 SDLoc DL(N);
6055 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
6056 Src, DAG.getConstant(NewMask, DL, MVT::i32));
6057 }
6058
6059 return SDValue();
6060 }
6061
6062 if (VT != MVT::i64)
6063 return SDValue();
6064
6065 // TODO: This could be a generic combine with a predicate for extracting the
6066 // high half of an integer being free.
6067
6068 // (or i64:x, (zero_extend i32:y)) ->
6069 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
6070 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
6071 RHS.getOpcode() != ISD::ZERO_EXTEND)
6072 std::swap(LHS, RHS);
6073
6074 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
6075 SDValue ExtSrc = RHS.getOperand(0);
6076 EVT SrcVT = ExtSrc.getValueType();
6077 if (SrcVT == MVT::i32) {
6078 SDLoc SL(N);
6079 SDValue LowLHS, HiBits;
6080 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
6081 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
6082
6083 DCI.AddToWorklist(LowOr.getNode());
6084 DCI.AddToWorklist(HiBits.getNode());
6085
6086 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
6087 LowOr, HiBits);
6088 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
6089 }
6090 }
6091
6092 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
6093 if (CRHS) {
6094 if (SDValue Split
6095 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
6096 return Split;
6097 }
6098
6099 return SDValue();
6100}
6101
6102SDValue SITargetLowering::performXorCombine(SDNode *N,
6103 DAGCombinerInfo &DCI) const {
6104 EVT VT = N->getValueType(0);
6105 if (VT != MVT::i64)
6106 return SDValue();
6107
6108 SDValue LHS = N->getOperand(0);
6109 SDValue RHS = N->getOperand(1);
6110
6111 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
6112 if (CRHS) {
6113 if (SDValue Split
6114 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
6115 return Split;
6116 }
6117
6118 return SDValue();
6119}
6120
6121// Instructions that will be lowered with a final instruction that zeros the
6122// high result bits.
6123// XXX - probably only need to list legal operations.
6124static bool fp16SrcZerosHighBits(unsigned Opc) {
6125 switch (Opc) {
6126 case ISD::FADD:
6127 case ISD::FSUB:
6128 case ISD::FMUL:
6129 case ISD::FDIV:
6130 case ISD::FREM:
6131 case ISD::FMA:
6132 case ISD::FMAD:
6133 case ISD::FCANONICALIZE:
6134 case ISD::FP_ROUND:
6135 case ISD::UINT_TO_FP:
6136 case ISD::SINT_TO_FP:
6137 case ISD::FABS:
6138 // Fabs is lowered to a bit operation, but it's an and which will clear the
6139 // high bits anyway.
6140 case ISD::FSQRT:
6141 case ISD::FSIN:
6142 case ISD::FCOS:
6143 case ISD::FPOWI:
6144 case ISD::FPOW:
6145 case ISD::FLOG:
6146 case ISD::FLOG2:
6147 case ISD::FLOG10:
6148 case ISD::FEXP:
6149 case ISD::FEXP2:
6150 case ISD::FCEIL:
6151 case ISD::FTRUNC:
6152 case ISD::FRINT:
6153 case ISD::FNEARBYINT:
6154 case ISD::FROUND:
6155 case ISD::FFLOOR:
6156 case ISD::FMINNUM:
6157 case ISD::FMAXNUM:
6158 case AMDGPUISD::FRACT:
6159 case AMDGPUISD::CLAMP:
6160 case AMDGPUISD::COS_HW:
6161 case AMDGPUISD::SIN_HW:
6162 case AMDGPUISD::FMIN3:
6163 case AMDGPUISD::FMAX3:
6164 case AMDGPUISD::FMED3:
6165 case AMDGPUISD::FMAD_FTZ:
6166 case AMDGPUISD::RCP:
6167 case AMDGPUISD::RSQ:
6168 case AMDGPUISD::LDEXP:
6169 return true;
6170 default:
6171 // fcopysign, select and others may be lowered to 32-bit bit operations
6172 // which don't zero the high bits.
6173 return false;
6174 }
6175}
6176
6177SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
6178 DAGCombinerInfo &DCI) const {
6179 if (!Subtarget->has16BitInsts() ||
6180 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
6181 return SDValue();
6182
6183 EVT VT = N->getValueType(0);
6184 if (VT != MVT::i32)
6185 return SDValue();
6186
6187 SDValue Src = N->getOperand(0);
6188 if (Src.getValueType() != MVT::i16)
6189 return SDValue();
6190
6191 // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
6192 // FIXME: It is not universally true that the high bits are zeroed on gfx9.
6193 if (Src.getOpcode() == ISD::BITCAST) {
6194 SDValue BCSrc = Src.getOperand(0);
6195 if (BCSrc.getValueType() == MVT::f16 &&
6196 fp16SrcZerosHighBits(BCSrc.getOpcode()))
6197 return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
6198 }
6199
6200 return SDValue();
6201}
6202
6203SDValue SITargetLowering::performClassCombine(SDNode *N,
6204 DAGCombinerInfo &DCI) const {
6205 SelectionDAG &DAG = DCI.DAG;
6206 SDValue Mask = N->getOperand(1);
6207
6208 // fp_class x, 0 -> false
6209 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
6210 if (CMask->isNullValue())
6211 return DAG.getConstant(0, SDLoc(N), MVT::i1);
6212 }
6213
6214 if (N->getOperand(0).isUndef())
6215 return DAG.getUNDEF(MVT::i1);
6216
6217 return SDValue();
6218}
6219
6220static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
6221 if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
6222 return true;
6223
6224 return DAG.isKnownNeverNaN(Op);
6225}
6226
6227static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
6228 const SISubtarget *ST, unsigned MaxDepth=5) {
6229 // If source is a result of another standard FP operation it is already in
6230 // canonical form.
6231
6232 switch (Op.getOpcode()) {
6233 default:
6234 break;
6235
6236 // These will flush denorms if required.
6237 case ISD::FADD:
6238 case ISD::FSUB:
6239 case ISD::FMUL:
6240 case ISD::FSQRT:
6241 case ISD::FCEIL:
6242 case ISD::FFLOOR:
6243 case ISD::FMA:
6244 case ISD::FMAD:
6245
6246 case ISD::FCANONICALIZE:
6247 return true;
6248
6249 case ISD::FP_ROUND:
6250 return Op.getValueType().getScalarType() != MVT::f16 ||
6251 ST->hasFP16Denormals();
6252
6253 case ISD::FP_EXTEND:
6254 return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
6255 ST->hasFP16Denormals();
6256
6257 case ISD::FP16_TO_FP:
6258 case ISD::FP_TO_FP16:
6259 return ST->hasFP16Denormals();
6260
6261 // It can/will be lowered or combined as a bit operation.
6262 // Need to check their input recursively to handle.
6263 case ISD::FNEG:
6264 case ISD::FABS:
6265 return (MaxDepth > 0) &&
6266 isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
6267
6268 case ISD::FSIN:
6269 case ISD::FCOS:
6270 case ISD::FSINCOS:
6271 return Op.getValueType().getScalarType() != MVT::f16;
6272
6273 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
6274 // For such targets need to check their input recursively.
6275