Bug Summary

File: lib/Target/AMDGPU/SIISelLowering.cpp
Warning: line 8848, column 20
The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'int'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name SIISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -mrelocation-model pic -pic-level 2 -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-8/lib/clang/8.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-8~svn345461/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-8~svn345461/build-llvm/include -I /build/llvm-toolchain-snapshot-8~svn345461/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/include/clang/8.0.0/include/ -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-8/lib/clang/8.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu 
-internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-8~svn345461/build-llvm/lib/Target/AMDGPU -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2018-10-27-211344-32123-1 -x c++ /build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp -faddrsig

/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp

1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// Custom DAG lowering for SI
12//
13//===----------------------------------------------------------------------===//
14
15#ifdef _MSC_VER
16// Provide M_PI.
17#define _USE_MATH_DEFINES
18#endif
19
20#include "SIISelLowering.h"
21#include "AMDGPU.h"
22#include "AMDGPUIntrinsicInfo.h"
23#include "AMDGPUSubtarget.h"
24#include "AMDGPUTargetMachine.h"
25#include "SIDefines.h"
26#include "SIInstrInfo.h"
27#include "SIMachineFunctionInfo.h"
28#include "SIRegisterInfo.h"
29#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/APFloat.h"
32#include "llvm/ADT/APInt.h"
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/SmallVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/StringSwitch.h"
39#include "llvm/ADT/Twine.h"
40#include "llvm/CodeGen/Analysis.h"
41#include "llvm/CodeGen/CallingConvLower.h"
42#include "llvm/CodeGen/DAGCombine.h"
43#include "llvm/CodeGen/ISDOpcodes.h"
44#include "llvm/CodeGen/MachineBasicBlock.h"
45#include "llvm/CodeGen/MachineFrameInfo.h"
46#include "llvm/CodeGen/MachineFunction.h"
47#include "llvm/CodeGen/MachineInstr.h"
48#include "llvm/CodeGen/MachineInstrBuilder.h"
49#include "llvm/CodeGen/MachineMemOperand.h"
50#include "llvm/CodeGen/MachineModuleInfo.h"
51#include "llvm/CodeGen/MachineOperand.h"
52#include "llvm/CodeGen/MachineRegisterInfo.h"
53#include "llvm/CodeGen/SelectionDAG.h"
54#include "llvm/CodeGen/SelectionDAGNodes.h"
55#include "llvm/CodeGen/TargetCallingConv.h"
56#include "llvm/CodeGen/TargetRegisterInfo.h"
57#include "llvm/CodeGen/ValueTypes.h"
58#include "llvm/IR/Constants.h"
59#include "llvm/IR/DataLayout.h"
60#include "llvm/IR/DebugLoc.h"
61#include "llvm/IR/DerivedTypes.h"
62#include "llvm/IR/DiagnosticInfo.h"
63#include "llvm/IR/Function.h"
64#include "llvm/IR/GlobalValue.h"
65#include "llvm/IR/InstrTypes.h"
66#include "llvm/IR/Instruction.h"
67#include "llvm/IR/Instructions.h"
68#include "llvm/IR/IntrinsicInst.h"
69#include "llvm/IR/Type.h"
70#include "llvm/Support/Casting.h"
71#include "llvm/Support/CodeGen.h"
72#include "llvm/Support/CommandLine.h"
73#include "llvm/Support/Compiler.h"
74#include "llvm/Support/ErrorHandling.h"
75#include "llvm/Support/KnownBits.h"
76#include "llvm/Support/MachineValueType.h"
77#include "llvm/Support/MathExtras.h"
78#include "llvm/Target/TargetOptions.h"
79#include <cassert>
80#include <cmath>
81#include <cstdint>
82#include <iterator>
83#include <tuple>
84#include <utility>
85#include <vector>
86
87using namespace llvm;
88
// Debug/statistics category name used by this file.
89#define DEBUG_TYPE"si-lower" "si-lower"
90
// Statistic counter for tail calls.
// NOTE(review): the text following the STATISTIC(...) invocation looks like the
// report's inline rendering of the macro expansion, not separate source code.
91STATISTIC(NumTailCalls, "Number of tail calls")static llvm::Statistic NumTailCalls = {"si-lower", "NumTailCalls"
, "Number of tail calls", {0}, {false}}
;
92
// Command-line flag "amdgpu-vgpr-index-mode"; defaults to false.
93static cl::opt<bool> EnableVGPRIndexMode(
94 "amdgpu-vgpr-index-mode",
95 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
96 cl::init(false));
97
// Command-line flag "amdgpu-frame-index-zero-bits"; defaults to 5 and is
// marked cl::ReallyHidden.
98static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
99 "amdgpu-frame-index-zero-bits",
100 cl::desc("High bits of frame index assumed to be zero"),
101 cl::init(5),
102 cl::ReallyHidden);
103
/// Returns the first register, counting up from AMDGPU::SGPR0, that \p CCInfo
/// does not report as allocated. The search covers as many registers as
/// AMDGPU::SGPR_32RegClass contains; if all of them are allocated the function
/// hits llvm_unreachable.
104static unsigned findFirstFreeSGPR(CCState &CCInfo) {
105 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
106 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
107 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
108 return AMDGPU::SGPR0 + Reg;
109 }
110 }
// NOTE(review): the text after llvm_unreachable(...) below looks like the
// report's inline rendering of the macro expansion.
111 llvm_unreachable("Cannot allocate sgpr")::llvm::llvm_unreachable_internal("Cannot allocate sgpr", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 111)
;
112}
113
/// Constructs the SI target lowering for subtarget \p STI.
///
/// The body is one long configuration sequence: it registers a register class
/// for each supported value type (addRegisterClass), calls
/// computeRegisterProperties, then declares per-(opcode, type) actions via
/// setOperationAction / setTruncStoreAction / AddPromotedToType, lists the
/// opcodes passed to setTargetDAGCombine, and finally sets the scheduling
/// preference and the floating-point-exception flag. Many entries are guarded
/// by Subtarget feature queries (has16BitInsts, hasVOP3PInsts, ...).
/// Several (opcode, type) pairs are set more than once, so statement order
/// should be preserved when editing.
114SITargetLowering::SITargetLowering(const TargetMachine &TM,
115 const GCNSubtarget &STI)
116 : AMDGPUTargetLowering(TM, STI),
117 Subtarget(&STI) {
118 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
119 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
120
121 addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
122 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
123
124 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
125 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
126 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
127
128 addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
129 addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
130
131 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
132 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
133
134 addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
135 addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
136
137 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
138 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
139
140 if (Subtarget->has16BitInsts()) {
141 addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
142 addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
143
144 // Unless there are also VOP3P operations, no operations are really legal.
145 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
146 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
147 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
148 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
149 }
150
151 computeRegisterProperties(Subtarget->getRegisterInfo());
152
153 // We need to custom lower vector loads and stores from local memory
154 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
155 setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
156 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
157 setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
158 setOperationAction(ISD::LOAD, MVT::i1, Custom);
159 setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
160
161 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
162 setOperationAction(ISD::STORE, MVT::v4i32, Custom);
163 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
164 setOperationAction(ISD::STORE, MVT::v16i32, Custom);
165 setOperationAction(ISD::STORE, MVT::i1, Custom);
166 setOperationAction(ISD::STORE, MVT::v32i32, Custom);
167
168 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
169 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
170 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
171 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
172 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
173 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
174 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
175 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
176 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
177 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
178
179 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
180 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
181
182 setOperationAction(ISD::SELECT, MVT::i1, Promote);
183 setOperationAction(ISD::SELECT, MVT::i64, Custom);
184 setOperationAction(ISD::SELECT, MVT::f64, Promote);
185 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
186
187 setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
188 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
189 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
190 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
191 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
192
193 setOperationAction(ISD::SETCC, MVT::i1, Promote);
194 setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
195 setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
196 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
197
198 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
199 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
200
201 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
202 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
203 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
204 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
205 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
206 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
207 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
208
209 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
210 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
211 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
212 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
213 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
214 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
215 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
216
217 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
218 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
219 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
220
221 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
222 setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
223 setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
224 setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
225
226 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
227 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
228 setOperationAction(ISD::BR_CC, MVT::i32, Expand);
229 setOperationAction(ISD::BR_CC, MVT::i64, Expand);
230 setOperationAction(ISD::BR_CC, MVT::f32, Expand);
231 setOperationAction(ISD::BR_CC, MVT::f64, Expand);
232
233 setOperationAction(ISD::UADDO, MVT::i32, Legal);
234 setOperationAction(ISD::USUBO, MVT::i32, Legal);
235
236 setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
237 setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
238
239 setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
240 setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
241 setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
242
243#if 0
244 setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
245 setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
246#endif
247
248 // We only support LOAD/STORE and vector manipulation ops for vectors
249 // with > 4 elements.
// NOTE(review): the type list below also contains 2- and 4-element vectors
// (v2i64, v2f64, v4i16, v4f16). Every opcode not named in the switch is set
// to Expand; CONCAT_VECTORS is set to Custom.
250 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
251 MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
252 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
253 switch (Op) {
254 case ISD::LOAD:
255 case ISD::STORE:
256 case ISD::BUILD_VECTOR:
257 case ISD::BITCAST:
258 case ISD::EXTRACT_VECTOR_ELT:
259 case ISD::INSERT_VECTOR_ELT:
260 case ISD::INSERT_SUBVECTOR:
261 case ISD::EXTRACT_SUBVECTOR:
262 case ISD::SCALAR_TO_VECTOR:
263 break;
264 case ISD::CONCAT_VECTORS:
265 setOperationAction(Op, VT, Custom);
266 break;
267 default:
268 setOperationAction(Op, VT, Expand);
269 break;
270 }
271 }
272 }
273
274 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
275
276 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
277 // is expanded to avoid having two separate loops in case the index is a VGPR.
278
279 // Most operations are naturally 32-bit vector operations. We only support
280 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
281 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
282 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
283 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
284
285 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
286 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
287
288 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
289 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
290
291 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
292 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
293 }
294
295 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
296 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
297 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
298 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
299
300 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
301 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
302
303 // Avoid stack access for these.
304 // TODO: Generalize to more vector types.
305 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
306 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
307 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
308 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
309
310 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
311 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
312 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
313 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
314 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
315
316 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
317 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
318 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
319
// NOTE(review): the two INSERT_VECTOR_ELT calls in this group repeat the
// v4i16/v4f16 Custom settings already made a few lines above.
320 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
321 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
322 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
323 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
324
325 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
326 // and output demarshalling
327 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
328 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
329
330 // We can't return success/failure, only the old value,
331 // let LLVM add the comparison
332 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
333 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
334
335 if (Subtarget->hasFlatAddressSpace()) {
336 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
337 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
338 }
339
340 setOperationAction(ISD::BSWAP, MVT::i32, Legal);
341 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
342
343 // On SI this is s_memtime and s_memrealtime on VI.
344 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
345 setOperationAction(ISD::TRAP, MVT::Other, Custom);
346 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
347
348 if (Subtarget->has16BitInsts()) {
349 setOperationAction(ISD::FLOG, MVT::f16, Custom);
350 setOperationAction(ISD::FEXP, MVT::f16, Custom);
351 setOperationAction(ISD::FLOG10, MVT::f16, Custom);
352 }
353
354 // v_mad_f32 does not support denormals according to some sources.
355 if (!Subtarget->hasFP32Denormals())
356 setOperationAction(ISD::FMAD, MVT::f32, Legal);
357
358 if (!Subtarget->hasBFI()) {
359 // fcopysign can be done in a single instruction with BFI.
360 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
361 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
362 }
363
364 if (!Subtarget->hasBCNT(32))
365 setOperationAction(ISD::CTPOP, MVT::i32, Expand);
366
367 if (!Subtarget->hasBCNT(64))
368 setOperationAction(ISD::CTPOP, MVT::i64, Expand);
369
370 if (Subtarget->hasFFBH())
371 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
372
373 if (Subtarget->hasFFBL())
374 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
375
376 // We only really have 32-bit BFE instructions (and 16-bit on VI).
377 //
378 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
379 // effort to match them now. We want this to be false for i64 cases when the
380 // extraction isn't restricted to the upper or lower half. Ideally we would
381 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
382 // span the midpoint are probably relatively rare, so don't worry about them
383 // for now.
384 if (Subtarget->hasBFE())
385 setHasExtractBitsInsn(true);
386
387 setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
388 setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
389 setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
390 setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
391
392
393 // These are really only legal for ieee_mode functions. We should be avoiding
394 // them for functions that don't have ieee_mode enabled, so just say they are
395 // legal.
396 setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
397 setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
398 setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
399 setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
400
401
402 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
403 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
404 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
405 setOperationAction(ISD::FRINT, MVT::f64, Legal);
406 } else {
407 setOperationAction(ISD::FCEIL, MVT::f64, Custom);
408 setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
409 setOperationAction(ISD::FRINT, MVT::f64, Custom);
410 setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
411 }
412
// NOTE(review): this unconditional call runs after the else branch above set
// FFLOOR/f64 to Custom; presumably the later setting is the one that takes
// effect — confirm that this is intended.
413 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
414
415 setOperationAction(ISD::FSIN, MVT::f32, Custom);
416 setOperationAction(ISD::FCOS, MVT::f32, Custom);
417 setOperationAction(ISD::FDIV, MVT::f32, Custom);
418 setOperationAction(ISD::FDIV, MVT::f64, Custom);
419
420 if (Subtarget->has16BitInsts()) {
421 setOperationAction(ISD::Constant, MVT::i16, Legal);
422
423 setOperationAction(ISD::SMIN, MVT::i16, Legal);
424 setOperationAction(ISD::SMAX, MVT::i16, Legal);
425
426 setOperationAction(ISD::UMIN, MVT::i16, Legal);
427 setOperationAction(ISD::UMAX, MVT::i16, Legal);
428
429 setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
430 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
431
432 setOperationAction(ISD::ROTR, MVT::i16, Promote);
433 setOperationAction(ISD::ROTL, MVT::i16, Promote);
434
435 setOperationAction(ISD::SDIV, MVT::i16, Promote);
436 setOperationAction(ISD::UDIV, MVT::i16, Promote);
437 setOperationAction(ISD::SREM, MVT::i16, Promote);
438 setOperationAction(ISD::UREM, MVT::i16, Promote);
439
440 setOperationAction(ISD::BSWAP, MVT::i16, Promote);
441 setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
442
443 setOperationAction(ISD::CTTZ, MVT::i16, Promote);
444 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
445 setOperationAction(ISD::CTLZ, MVT::i16, Promote);
446 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
447 setOperationAction(ISD::CTPOP, MVT::i16, Promote);
448
449 setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
450
451 setOperationAction(ISD::BR_CC, MVT::i16, Expand);
452
453 setOperationAction(ISD::LOAD, MVT::i16, Custom);
454
455 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
456
457 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
458 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
459 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
460 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
461
462 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
463 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
464 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
465 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
466
467 // F16 - Constant Actions.
468 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
469
470 // F16 - Load/Store Actions.
471 setOperationAction(ISD::LOAD, MVT::f16, Promote);
472 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
473 setOperationAction(ISD::STORE, MVT::f16, Promote);
474 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
475
476 // F16 - VOP1 Actions.
477 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
478 setOperationAction(ISD::FCOS, MVT::f16, Promote);
479 setOperationAction(ISD::FSIN, MVT::f16, Promote);
480 setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
481 setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
482 setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
483 setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
484 setOperationAction(ISD::FROUND, MVT::f16, Custom);
485
486 // F16 - VOP2 Actions.
487 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
488 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
489
490 setOperationAction(ISD::FDIV, MVT::f16, Custom);
491
492 // F16 - VOP3 Actions.
493 setOperationAction(ISD::FMA, MVT::f16, Legal);
494 if (!Subtarget->hasFP16Denormals())
495 setOperationAction(ISD::FMAD, MVT::f16, Legal);
496
// Same default-to-Expand sweep as the earlier vector loop, here applied to the
// packed 16-bit vector types.
497 for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
498 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
499 switch (Op) {
500 case ISD::LOAD:
501 case ISD::STORE:
502 case ISD::BUILD_VECTOR:
503 case ISD::BITCAST:
504 case ISD::EXTRACT_VECTOR_ELT:
505 case ISD::INSERT_VECTOR_ELT:
506 case ISD::INSERT_SUBVECTOR:
507 case ISD::EXTRACT_SUBVECTOR:
508 case ISD::SCALAR_TO_VECTOR:
509 break;
510 case ISD::CONCAT_VECTORS:
511 setOperationAction(Op, VT, Custom);
512 break;
513 default:
514 setOperationAction(Op, VT, Expand);
515 break;
516 }
517 }
518 }
519
520 // XXX - Do these do anything? Vector constants turn into build_vector.
521 setOperationAction(ISD::Constant, MVT::v2i16, Legal);
522 setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
523
524 setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
525 setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
526
527 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
528 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
529 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
530 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
531
532 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
533 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
534 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
535 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
536
537 setOperationAction(ISD::AND, MVT::v2i16, Promote);
538 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
539 setOperationAction(ISD::OR, MVT::v2i16, Promote);
540 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
541 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
542 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
543
544 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
545 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
546 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
547 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
548
549 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
550 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
551 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
552 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
553
554 setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
555 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
556 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
557 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
558
559 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
560 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
561 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
562
563 if (!Subtarget->hasVOP3PInsts()) {
564 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
565 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
566 }
567
568 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
569 // This isn't really legal, but this avoids the legalizer unrolling it (and
570 // allows matching fneg (fabs x) patterns)
571 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
572
573 setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
574 setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
575 setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
576 setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
577
578 setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
579 setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
580
581 setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
582 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
583 }
584
585 if (Subtarget->hasVOP3PInsts()) {
586 setOperationAction(ISD::ADD, MVT::v2i16, Legal);
587 setOperationAction(ISD::SUB, MVT::v2i16, Legal);
588 setOperationAction(ISD::MUL, MVT::v2i16, Legal);
589 setOperationAction(ISD::SHL, MVT::v2i16, Legal);
590 setOperationAction(ISD::SRL, MVT::v2i16, Legal);
591 setOperationAction(ISD::SRA, MVT::v2i16, Legal);
592 setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
593 setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
594 setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
595 setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
596
597 setOperationAction(ISD::FADD, MVT::v2f16, Legal);
598 setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
599 setOperationAction(ISD::FMA, MVT::v2f16, Legal);
600
601 setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
602 setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
603
604 setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
605
// NOTE(review): EXTRACT_VECTOR_ELT for v2i16/v2f16 was already set to Custom
// unconditionally earlier in this constructor.
606 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
607 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
608
609 setOperationAction(ISD::SHL, MVT::v4i16, Custom);
610 setOperationAction(ISD::SRA, MVT::v4i16, Custom);
611 setOperationAction(ISD::SRL, MVT::v4i16, Custom);
612 setOperationAction(ISD::ADD, MVT::v4i16, Custom);
613 setOperationAction(ISD::SUB, MVT::v4i16, Custom);
614 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
615
616 setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
617 setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
618 setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
619 setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
620
621 setOperationAction(ISD::FADD, MVT::v4f16, Custom);
622 setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
623
624 setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
625 setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
626
627 setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
628 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
629 setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
630
631 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
632 setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
633 setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
634 }
635
636 setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
637 setOperationAction(ISD::FABS, MVT::v4f16, Custom);
638
639 if (Subtarget->has16BitInsts()) {
640 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
641 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
642 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
643 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
644 } else {
645 // Legalization hack.
646 setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
647 setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
648
649 setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
650 setOperationAction(ISD::FABS, MVT::v2f16, Custom);
651 }
652
653 for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
654 setOperationAction(ISD::SELECT, VT, Custom);
655 }
656
// Opcodes registered through setTargetDAGCombine.
657 setTargetDAGCombine(ISD::ADD);
658 setTargetDAGCombine(ISD::ADDCARRY);
659 setTargetDAGCombine(ISD::SUB);
660 setTargetDAGCombine(ISD::SUBCARRY);
661 setTargetDAGCombine(ISD::FADD);
662 setTargetDAGCombine(ISD::FSUB);
663 setTargetDAGCombine(ISD::FMINNUM);
664 setTargetDAGCombine(ISD::FMAXNUM);
665 setTargetDAGCombine(ISD::FMINNUM_IEEE);
666 setTargetDAGCombine(ISD::FMAXNUM_IEEE);
667 setTargetDAGCombine(ISD::FMA);
668 setTargetDAGCombine(ISD::SMIN);
669 setTargetDAGCombine(ISD::SMAX);
670 setTargetDAGCombine(ISD::UMIN);
671 setTargetDAGCombine(ISD::UMAX);
672 setTargetDAGCombine(ISD::SETCC);
673 setTargetDAGCombine(ISD::AND);
674 setTargetDAGCombine(ISD::OR);
675 setTargetDAGCombine(ISD::XOR);
676 setTargetDAGCombine(ISD::SINT_TO_FP);
677 setTargetDAGCombine(ISD::UINT_TO_FP);
678 setTargetDAGCombine(ISD::FCANONICALIZE);
679 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
680 setTargetDAGCombine(ISD::ZERO_EXTEND);
681 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
682 setTargetDAGCombine(ISD::BUILD_VECTOR);
683
684 // All memory operations. Some folding on the pointer operand is done to help
685 // matching the constant offsets in the addressing modes.
686 setTargetDAGCombine(ISD::LOAD);
687 setTargetDAGCombine(ISD::STORE);
688 setTargetDAGCombine(ISD::ATOMIC_LOAD);
689 setTargetDAGCombine(ISD::ATOMIC_STORE);
690 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
691 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
692 setTargetDAGCombine(ISD::ATOMIC_SWAP);
693 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
694 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
695 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
696 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
697 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
698 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
699 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
700 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
701 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
702 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
703
704 setSchedulingPreference(Sched::RegPressure);
705
706 // SI at least has hardware support for floating point exceptions, but no way
707 // of using or handling them is implemented. They are also optional in OpenCL
708 // (Section 7.3)
709 setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
710}
711
712const GCNSubtarget *SITargetLowering::getSubtarget() const {
713 return Subtarget;
714}
715
716//===----------------------------------------------------------------------===//
717// TargetLowering queries
718//===----------------------------------------------------------------------===//
719
720// v_mad_mix* support a conversion from f16 to f32.
721//
722// There is only one special case when denormals are enabled we don't currently,
723// where this is OK to use.
724bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
725 EVT DestVT, EVT SrcVT) const {
726 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
727 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
728 DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
729 SrcVT.getScalarType() == MVT::f16;
730}
731
732bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
733 // SI has some legal vector types, but no legal vector operations. Say no
734 // shuffles are legal in order to prefer scalarizing some vector operations.
735 return false;
736}
737
738MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
739 CallingConv::ID CC,
740 EVT VT) const {
741 // TODO: Consider splitting all arguments into 32-bit pieces.
742 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
743 EVT ScalarVT = VT.getScalarType();
744 unsigned Size = ScalarVT.getSizeInBits();
745 if (Size == 32)
746 return ScalarVT.getSimpleVT();
747
748 if (Size == 64)
749 return MVT::i32;
750
751 if (Size == 16 && Subtarget->has16BitInsts())
752 return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
753 }
754
755 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
756}
757
758unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
759 CallingConv::ID CC,
760 EVT VT) const {
761 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
762 unsigned NumElts = VT.getVectorNumElements();
763 EVT ScalarVT = VT.getScalarType();
764 unsigned Size = ScalarVT.getSizeInBits();
765
766 if (Size == 32)
767 return NumElts;
768
769 if (Size == 64)
770 return 2 * NumElts;
771
772 if (Size == 16 && Subtarget->has16BitInsts())
773 return (VT.getVectorNumElements() + 1) / 2;
774 }
775
776 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
777}
778
779unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
780 LLVMContext &Context, CallingConv::ID CC,
781 EVT VT, EVT &IntermediateVT,
782 unsigned &NumIntermediates, MVT &RegisterVT) const {
783 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
784 unsigned NumElts = VT.getVectorNumElements();
785 EVT ScalarVT = VT.getScalarType();
786 unsigned Size = ScalarVT.getSizeInBits();
787 if (Size == 32) {
788 RegisterVT = ScalarVT.getSimpleVT();
789 IntermediateVT = RegisterVT;
790 NumIntermediates = NumElts;
791 return NumIntermediates;
792 }
793
794 if (Size == 64) {
795 RegisterVT = MVT::i32;
796 IntermediateVT = RegisterVT;
797 NumIntermediates = 2 * NumElts;
798 return NumIntermediates;
799 }
800
801 // FIXME: We should fix the ABI to be the same on targets without 16-bit
802 // support, but unless we can properly handle 3-vectors, it will be still be
803 // inconsistent.
804 if (Size == 16 && Subtarget->has16BitInsts()) {
805 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
806 IntermediateVT = RegisterVT;
807 NumIntermediates = (NumElts + 1) / 2;
808 return NumIntermediates;
809 }
810 }
811
812 return TargetLowering::getVectorTypeBreakdownForCallingConv(
813 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
814}
815
816bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
817 const CallInst &CI,
818 MachineFunction &MF,
819 unsigned IntrID) const {
820 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
821 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
822 AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
823 (Intrinsic::ID)IntrID);
824 if (Attr.hasFnAttribute(Attribute::ReadNone))
825 return false;
826
827 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
828
829 if (RsrcIntr->IsImage) {
830 Info.ptrVal = MFI->getImagePSV(
831 *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
832 CI.getArgOperand(RsrcIntr->RsrcArg));
833 Info.align = 0;
834 } else {
835 Info.ptrVal = MFI->getBufferPSV(
836 *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
837 CI.getArgOperand(RsrcIntr->RsrcArg));
838 }
839
840 Info.flags = MachineMemOperand::MODereferenceable;
841 if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
842 Info.opc = ISD::INTRINSIC_W_CHAIN;
843 Info.memVT = MVT::getVT(CI.getType());
844 Info.flags |= MachineMemOperand::MOLoad;
845 } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
846 Info.opc = ISD::INTRINSIC_VOID;
847 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
848 Info.flags |= MachineMemOperand::MOStore;
849 } else {
850 // Atomic
851 Info.opc = ISD::INTRINSIC_W_CHAIN;
852 Info.memVT = MVT::getVT(CI.getType());
853 Info.flags = MachineMemOperand::MOLoad |
854 MachineMemOperand::MOStore |
855 MachineMemOperand::MODereferenceable;
856
857 // XXX - Should this be volatile without known ordering?
858 Info.flags |= MachineMemOperand::MOVolatile;
859 }
860 return true;
861 }
862
863 switch (IntrID) {
864 case Intrinsic::amdgcn_atomic_inc:
865 case Intrinsic::amdgcn_atomic_dec:
866 case Intrinsic::amdgcn_ds_fadd:
867 case Intrinsic::amdgcn_ds_fmin:
868 case Intrinsic::amdgcn_ds_fmax: {
869 Info.opc = ISD::INTRINSIC_W_CHAIN;
870 Info.memVT = MVT::getVT(CI.getType());
871 Info.ptrVal = CI.getOperand(0);
872 Info.align = 0;
873 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
874
875 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
876 if (!Vol || !Vol->isZero())
877 Info.flags |= MachineMemOperand::MOVolatile;
878
879 return true;
880 }
881
882 default:
883 return false;
884 }
885}
886
887bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
888 SmallVectorImpl<Value*> &Ops,
889 Type *&AccessTy) const {
890 switch (II->getIntrinsicID()) {
891 case Intrinsic::amdgcn_atomic_inc:
892 case Intrinsic::amdgcn_atomic_dec:
893 case Intrinsic::amdgcn_ds_fadd:
894 case Intrinsic::amdgcn_ds_fmin:
895 case Intrinsic::amdgcn_ds_fmax: {
896 Value *Ptr = II->getArgOperand(0);
897 AccessTy = II->getType();
898 Ops.push_back(Ptr);
899 return true;
900 }
901 default:
902 return false;
903 }
904}
905
906bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
907 if (!Subtarget->hasFlatInstOffsets()) {
908 // Flat instructions do not have offsets, and only have the register
909 // address.
910 return AM.BaseOffs == 0 && AM.Scale == 0;
911 }
912
913 // GFX9 added a 13-bit signed offset. When using regular flat instructions,
914 // the sign bit is ignored and is treated as a 12-bit unsigned offset.
915
916 // Just r + i
917 return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
918}
919
920bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
921 if (Subtarget->hasFlatGlobalInsts())
922 return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
923
924 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
925 // Assume the we will use FLAT for all global memory accesses
926 // on VI.
927 // FIXME: This assumption is currently wrong. On VI we still use
928 // MUBUF instructions for the r + i addressing mode. As currently
929 // implemented, the MUBUF instructions only work on buffer < 4GB.
930 // It may be possible to support > 4GB buffers with MUBUF instructions,
931 // by setting the stride value in the resource descriptor which would
932 // increase the size limit to (stride * 4GB). However, this is risky,
933 // because it has never been validated.
934 return isLegalFlatAddressingMode(AM);
935 }
936
937 return isLegalMUBUFAddressingMode(AM);
938}
939
940bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
941 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
942 // additionally can do r + r + i with addr64. 32-bit has more addressing
943 // mode options. Depending on the resource constant, it can also do
944 // (i64 r0) + (i32 r1) * (i14 i).
945 //
946 // Private arrays end up using a scratch buffer most of the time, so also
947 // assume those use MUBUF instructions. Scratch loads / stores are currently
948 // implemented as mubuf instructions with offen bit set, so slightly
949 // different than the normal addr64.
950 if (!isUInt<12>(AM.BaseOffs))
951 return false;
952
953 // FIXME: Since we can split immediate into soffset and immediate offset,
954 // would it make sense to allow any immediate?
955
956 switch (AM.Scale) {
957 case 0: // r + i or just i, depending on HasBaseReg.
958 return true;
959 case 1:
960 return true; // We have r + r or r + i.
961 case 2:
962 if (AM.HasBaseReg) {
963 // Reject 2 * r + r.
964 return false;
965 }
966
967 // Allow 2 * r as r + r
968 // Or 2 * r + i is allowed as r + r + i.
969 return true;
970 default: // Don't allow n * r
971 return false;
972 }
973}
974
975bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
976 const AddrMode &AM, Type *Ty,
977 unsigned AS, Instruction *I) const {
978 // No global is ever allowed as a base.
979 if (AM.BaseGV)
980 return false;
981
982 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
983 return isLegalGlobalAddressingMode(AM);
984
985 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
986 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
987 // If the offset isn't a multiple of 4, it probably isn't going to be
988 // correctly aligned.
989 // FIXME: Can we get the real alignment here?
990 if (AM.BaseOffs % 4 != 0)
991 return isLegalMUBUFAddressingMode(AM);
992
993 // There are no SMRD extloads, so if we have to do a small type access we
994 // will use a MUBUF load.
995 // FIXME?: We also need to do this if unaligned, but we don't know the
996 // alignment here.
997 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
998 return isLegalGlobalAddressingMode(AM);
999
1000 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1001 // SMRD instructions have an 8-bit, dword offset on SI.
1002 if (!isUInt<8>(AM.BaseOffs / 4))
1003 return false;
1004 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1005 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1006 // in 8-bits, it can use a smaller encoding.
1007 if (!isUInt<32>(AM.BaseOffs / 4))
1008 return false;
1009 } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1010 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1011 if (!isUInt<20>(AM.BaseOffs))
1012 return false;
1013 } else
1014 llvm_unreachable("unhandled generation")::llvm::llvm_unreachable_internal("unhandled generation", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1014)
;
1015
1016 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1017 return true;
1018
1019 if (AM.Scale == 1 && AM.HasBaseReg)
1020 return true;
1021
1022 return false;
1023
1024 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1025 return isLegalMUBUFAddressingMode(AM);
1026 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1027 AS == AMDGPUAS::REGION_ADDRESS) {
1028 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1029 // field.
1030 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1031 // an 8-bit dword offset but we don't know the alignment here.
1032 if (!isUInt<16>(AM.BaseOffs))
1033 return false;
1034
1035 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1036 return true;
1037
1038 if (AM.Scale == 1 && AM.HasBaseReg)
1039 return true;
1040
1041 return false;
1042 } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1043 AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1044 // For an unknown address space, this usually means that this is for some
1045 // reason being used for pure arithmetic, and not based on some addressing
1046 // computation. We don't have instructions that compute pointers with any
1047 // addressing modes, so treat them as having no offset like flat
1048 // instructions.
1049 return isLegalFlatAddressingMode(AM);
1050 } else {
1051 llvm_unreachable("unhandled address space")::llvm::llvm_unreachable_internal("unhandled address space", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1051)
;
1052 }
1053}
1054
1055bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1056 const SelectionDAG &DAG) const {
1057 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1058 return (MemVT.getSizeInBits() <= 4 * 32);
1059 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1060 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1061 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1062 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
1063 return (MemVT.getSizeInBits() <= 2 * 32);
1064 }
1065 return true;
1066}
1067
1068bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1069 unsigned AddrSpace,
1070 unsigned Align,
1071 bool *IsFast) const {
1072 if (IsFast)
1073 *IsFast = false;
1074
1075 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1076 // which isn't a simple VT.
1077 // Until MVT is extended to handle this, simply check for the size and
1078 // rely on the condition below: allow accesses if the size is a multiple of 4.
1079 if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1080 VT.getStoreSize() > 16)) {
1081 return false;
1082 }
1083
1084 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1085 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1086 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1087 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1088 // with adjacent offsets.
1089 bool AlignedBy4 = (Align % 4 == 0);
1090 if (IsFast)
1091 *IsFast = AlignedBy4;
1092
1093 return AlignedBy4;
1094 }
1095
1096 // FIXME: We have to be conservative here and assume that flat operations
1097 // will access scratch. If we had access to the IR function, then we
1098 // could determine if any private memory was used in the function.
1099 if (!Subtarget->hasUnalignedScratchAccess() &&
1100 (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1101 AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
1102 bool AlignedBy4 = Align >= 4;
1103 if (IsFast)
1104 *IsFast = AlignedBy4;
1105
1106 return AlignedBy4;
1107 }
1108
1109 if (Subtarget->hasUnalignedBufferAccess()) {
1110 // If we have an uniform constant load, it still requires using a slow
1111 // buffer instruction if unaligned.
1112 if (IsFast) {
1113 *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1114 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1115 (Align % 4 == 0) : true;
1116 }
1117
1118 return true;
1119 }
1120
1121 // Smaller than dword value must be aligned.
1122 if (VT.bitsLT(MVT::i32))
1123 return false;
1124
1125 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1126 // byte-address are ignored, thus forcing Dword alignment.
1127 // This applies to private, global, and constant memory.
1128 if (IsFast)
1129 *IsFast = true;
1130
1131 return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1132}
1133
1134EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
1135 unsigned SrcAlign, bool IsMemset,
1136 bool ZeroMemset,
1137 bool MemcpyStrSrc,
1138 MachineFunction &MF) const {
1139 // FIXME: Should account for address space here.
1140
1141 // The default fallback uses the private pointer size as a guess for a type to
1142 // use. Make sure we switch these to 64-bit accesses.
1143
1144 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1145 return MVT::v4i32;
1146
1147 if (Size >= 8 && DstAlign >= 4)
1148 return MVT::v2i32;
1149
1150 // Use the default.
1151 return MVT::Other;
1152}
1153
1154static bool isFlatGlobalAddrSpace(unsigned AS) {
1155 return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1156 AS == AMDGPUAS::FLAT_ADDRESS ||
1157 AS == AMDGPUAS::CONSTANT_ADDRESS;
1158}
1159
1160bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1161 unsigned DestAS) const {
1162 return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
1163}
1164
1165bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1166 const MemSDNode *MemNode = cast<MemSDNode>(N);
1167 const Value *Ptr = MemNode->getMemOperand()->getValue();
1168 const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1169 return I && I->getMetadata("amdgpu.noclobber");
1170}
1171
1172bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
1173 unsigned DestAS) const {
1174 // Flat -> private/local is a simple truncate.
1175 // Flat -> global is no-op
1176 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1177 return true;
1178
1179 return isNoopAddrSpaceCast(SrcAS, DestAS);
1180}
1181
1182bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1183 const MemSDNode *MemNode = cast<MemSDNode>(N);
1184
1185 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1186}
1187
1188TargetLoweringBase::LegalizeTypeAction
1189SITargetLowering::getPreferredVectorAction(EVT VT) const {
1190 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1191 return TypeSplitVector;
1192
1193 return TargetLoweringBase::getPreferredVectorAction(VT);
1194}
1195
1196bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1197 Type *Ty) const {
1198 // FIXME: Could be smarter if called for vector constants.
1199 return true;
1200}
1201
1202bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1203 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1204 switch (Op) {
1205 case ISD::LOAD:
1206 case ISD::STORE:
1207
1208 // These operations are done with 32-bit instructions anyway.
1209 case ISD::AND:
1210 case ISD::OR:
1211 case ISD::XOR:
1212 case ISD::SELECT:
1213 // TODO: Extensions?
1214 return true;
1215 default:
1216 return false;
1217 }
1218 }
1219
1220 // SimplifySetCC uses this function to determine whether or not it should
1221 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1222 if (VT == MVT::i1 && Op == ISD::SETCC)
1223 return false;
1224
1225 return TargetLowering::isTypeDesirableForOp(Op, VT);
1226}
1227
1228SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1229 const SDLoc &SL,
1230 SDValue Chain,
1231 uint64_t Offset) const {
1232 const DataLayout &DL = DAG.getDataLayout();
1233 MachineFunction &MF = DAG.getMachineFunction();
1234 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1235
1236 const ArgDescriptor *InputPtrReg;
1237 const TargetRegisterClass *RC;
1238
1239 std::tie(InputPtrReg, RC)
1240 = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1241
1242 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1243 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
1244 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1245 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1246
1247 return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
1248}
1249
1250SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1251 const SDLoc &SL) const {
1252 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1253 FIRST_IMPLICIT);
1254 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1255}
1256
1257SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1258 const SDLoc &SL, SDValue Val,
1259 bool Signed,
1260 const ISD::InputArg *Arg) const {
1261 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1262 VT.bitsLT(MemVT)) {
1263 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1264 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1265 }
1266
1267 if (MemVT.isFloatingPoint())
1268 Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
1269 else if (Signed)
1270 Val = DAG.getSExtOrTrunc(Val, SL, VT);
1271 else
1272 Val = DAG.getZExtOrTrunc(Val, SL, VT);
1273
1274 return Val;
1275}
1276
1277SDValue SITargetLowering::lowerKernargMemParameter(
1278 SelectionDAG &DAG, EVT VT, EVT MemVT,
1279 const SDLoc &SL, SDValue Chain,
1280 uint64_t Offset, unsigned Align, bool Signed,
1281 const ISD::InputArg *Arg) const {
1282 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
1283 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
1284 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1285
1286 // Try to avoid using an extload by loading earlier than the argument address,
1287 // and extracting the relevant bits. The load should hopefully be merged with
1288 // the previous argument.
1289 if (MemVT.getStoreSize() < 4 && Align < 4) {
1290 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1291 int64_t AlignDownOffset = alignDown(Offset, 4);
1292 int64_t OffsetDiff = Offset - AlignDownOffset;
1293
1294 EVT IntVT = MemVT.changeTypeToInteger();
1295
1296 // TODO: If we passed in the base kernel offset we could have a better
1297 // alignment than 4, but we don't really need it.
1298 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1299 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1300 MachineMemOperand::MODereferenceable |
1301 MachineMemOperand::MOInvariant);
1302
1303 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1304 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1305
1306 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1307 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1308 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1309
1310
1311 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1312 }
1313
1314 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1315 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
1316 MachineMemOperand::MODereferenceable |
1317 MachineMemOperand::MOInvariant);
1318
1319 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1320 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1321}
1322
1323SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1324 const SDLoc &SL, SDValue Chain,
1325 const ISD::InputArg &Arg) const {
1326 MachineFunction &MF = DAG.getMachineFunction();
1327 MachineFrameInfo &MFI = MF.getFrameInfo();
1328
1329 if (Arg.Flags.isByVal()) {
1330 unsigned Size = Arg.Flags.getByValSize();
1331 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1332 return DAG.getFrameIndex(FrameIdx, MVT::i32);
1333 }
1334
1335 unsigned ArgOffset = VA.getLocMemOffset();
1336 unsigned ArgSize = VA.getValVT().getStoreSize();
1337
1338 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1339
1340 // Create load nodes to retrieve arguments from the stack.
1341 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1342 SDValue ArgValue;
1343
1344 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
1345 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1346 MVT MemVT = VA.getValVT();
1347
1348 switch (VA.getLocInfo()) {
1349 default:
1350 break;
1351 case CCValAssign::BCvt:
1352 MemVT = VA.getLocVT();
1353 break;
1354 case CCValAssign::SExt:
1355 ExtType = ISD::SEXTLOAD;
1356 break;
1357 case CCValAssign::ZExt:
1358 ExtType = ISD::ZEXTLOAD;
1359 break;
1360 case CCValAssign::AExt:
1361 ExtType = ISD::EXTLOAD;
1362 break;
1363 }
1364
1365 ArgValue = DAG.getExtLoad(
1366 ExtType, SL, VA.getLocVT(), Chain, FIN,
1367 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1368 MemVT);
1369 return ArgValue;
1370}
1371
1372SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1373 const SIMachineFunctionInfo &MFI,
1374 EVT VT,
1375 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1376 const ArgDescriptor *Reg;
1377 const TargetRegisterClass *RC;
1378
1379 std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1380 return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1381}
1382
1383static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1384 CallingConv::ID CallConv,
1385 ArrayRef<ISD::InputArg> Ins,
1386 BitVector &Skipped,
1387 FunctionType *FType,
1388 SIMachineFunctionInfo *Info) {
1389 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1390 const ISD::InputArg *Arg = &Ins[I];
1391
1392 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&(((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits()
== 16) && "vector type argument should have been split"
) ? static_cast<void> (0) : __assert_fail ("(!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) && \"vector type argument should have been split\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1393, __PRETTY_FUNCTION__))
1393 "vector type argument should have been split")(((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits()
== 16) && "vector type argument should have been split"
) ? static_cast<void> (0) : __assert_fail ("(!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) && \"vector type argument should have been split\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1393, __PRETTY_FUNCTION__))
;
1394
1395 // First check if it's a PS input addr.
1396 if (CallConv == CallingConv::AMDGPU_PS &&
1397 !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
1398
1399 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1400
1401 // Inconveniently only the first part of the split is marked as isSplit,
1402 // so skip to the end. We only want to increment PSInputNum once for the
1403 // entire split argument.
1404 if (Arg->Flags.isSplit()) {
1405 while (!Arg->Flags.isSplitEnd()) {
1406 assert(!Arg->VT.isVector() &&((!Arg->VT.isVector() && "unexpected vector split in ps argument type"
) ? static_cast<void> (0) : __assert_fail ("!Arg->VT.isVector() && \"unexpected vector split in ps argument type\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1407, __PRETTY_FUNCTION__))
1407 "unexpected vector split in ps argument type")((!Arg->VT.isVector() && "unexpected vector split in ps argument type"
) ? static_cast<void> (0) : __assert_fail ("!Arg->VT.isVector() && \"unexpected vector split in ps argument type\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1407, __PRETTY_FUNCTION__))
;
1408 if (!SkipArg)
1409 Splits.push_back(*Arg);
1410 Arg = &Ins[++I];
1411 }
1412 }
1413
1414 if (SkipArg) {
1415 // We can safely skip PS inputs.
1416 Skipped.set(Arg->getOrigArgIndex());
1417 ++PSInputNum;
1418 continue;
1419 }
1420
1421 Info->markPSInputAllocated(PSInputNum);
1422 if (Arg->Used)
1423 Info->markPSInputEnabled(PSInputNum);
1424
1425 ++PSInputNum;
1426 }
1427
1428 Splits.push_back(*Arg);
1429 }
1430}
1431
1432// Allocate special inputs passed in VGPRs.
1433static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1434 MachineFunction &MF,
1435 const SIRegisterInfo &TRI,
1436 SIMachineFunctionInfo &Info) {
1437 if (Info.hasWorkItemIDX()) {
1438 unsigned Reg = AMDGPU::VGPR0;
1439 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1440
1441 CCInfo.AllocateReg(Reg);
1442 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1443 }
1444
1445 if (Info.hasWorkItemIDY()) {
1446 unsigned Reg = AMDGPU::VGPR1;
1447 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1448
1449 CCInfo.AllocateReg(Reg);
1450 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1451 }
1452
1453 if (Info.hasWorkItemIDZ()) {
1454 unsigned Reg = AMDGPU::VGPR2;
1455 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1456
1457 CCInfo.AllocateReg(Reg);
1458 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1459 }
1460}
1461
1462// Try to allocate a VGPR at the end of the argument list, or if no argument
1463// VGPRs are left allocating a stack slot.
1464static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1465 ArrayRef<MCPhysReg> ArgVGPRs
1466 = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1467 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1468 if (RegIdx == ArgVGPRs.size()) {
1469 // Spill to stack required.
1470 int64_t Offset = CCInfo.AllocateStack(4, 4);
1471
1472 return ArgDescriptor::createStack(Offset);
1473 }
1474
1475 unsigned Reg = ArgVGPRs[RegIdx];
1476 Reg = CCInfo.AllocateReg(Reg);
1477 assert(Reg != AMDGPU::NoRegister)((Reg != AMDGPU::NoRegister) ? static_cast<void> (0) : __assert_fail
("Reg != AMDGPU::NoRegister", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1477, __PRETTY_FUNCTION__))
;
1478
1479 MachineFunction &MF = CCInfo.getMachineFunction();
1480 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1481 return ArgDescriptor::createRegister(Reg);
1482}
1483
1484static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1485 const TargetRegisterClass *RC,
1486 unsigned NumArgRegs) {
1487 ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1488 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1489 if (RegIdx == ArgSGPRs.size())
1490 report_fatal_error("ran out of SGPRs for arguments");
1491
1492 unsigned Reg = ArgSGPRs[RegIdx];
1493 Reg = CCInfo.AllocateReg(Reg);
1494 assert(Reg != AMDGPU::NoRegister)((Reg != AMDGPU::NoRegister) ? static_cast<void> (0) : __assert_fail
("Reg != AMDGPU::NoRegister", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1494, __PRETTY_FUNCTION__))
;
1495
1496 MachineFunction &MF = CCInfo.getMachineFunction();
1497 MF.addLiveIn(Reg, RC);
1498 return ArgDescriptor::createRegister(Reg);
1499}
1500
1501static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1502 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1503}
1504
1505static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1506 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1507}
1508
1509static void allocateSpecialInputVGPRs(CCState &CCInfo,
1510 MachineFunction &MF,
1511 const SIRegisterInfo &TRI,
1512 SIMachineFunctionInfo &Info) {
1513 if (Info.hasWorkItemIDX())
1514 Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1515
1516 if (Info.hasWorkItemIDY())
1517 Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1518
1519 if (Info.hasWorkItemIDZ())
1520 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1521}
1522
1523static void allocateSpecialInputSGPRs(CCState &CCInfo,
1524 MachineFunction &MF,
1525 const SIRegisterInfo &TRI,
1526 SIMachineFunctionInfo &Info) {
1527 auto &ArgInfo = Info.getArgInfo();
1528
1529 // TODO: Unify handling with private memory pointers.
1530
1531 if (Info.hasDispatchPtr())
1532 ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1533
1534 if (Info.hasQueuePtr())
1535 ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1536
1537 if (Info.hasKernargSegmentPtr())
1538 ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1539
1540 if (Info.hasDispatchID())
1541 ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1542
1543 // flat_scratch_init is not applicable for non-kernel functions.
1544
1545 if (Info.hasWorkGroupIDX())
1546 ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1547
1548 if (Info.hasWorkGroupIDY())
1549 ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1550
1551 if (Info.hasWorkGroupIDZ())
1552 ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1553
1554 if (Info.hasImplicitArgPtr())
1555 ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1556}
1557
1558// Allocate special inputs passed in user SGPRs.
1559static void allocateHSAUserSGPRs(CCState &CCInfo,
1560 MachineFunction &MF,
1561 const SIRegisterInfo &TRI,
1562 SIMachineFunctionInfo &Info) {
1563 if (Info.hasImplicitBufferPtr()) {
1564 unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1565 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1566 CCInfo.AllocateReg(ImplicitBufferPtrReg);
1567 }
1568
1569 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1570 if (Info.hasPrivateSegmentBuffer()) {
1571 unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1572 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1573 CCInfo.AllocateReg(PrivateSegmentBufferReg);
1574 }
1575
1576 if (Info.hasDispatchPtr()) {
1577 unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1578 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1579 CCInfo.AllocateReg(DispatchPtrReg);
1580 }
1581
1582 if (Info.hasQueuePtr()) {
1583 unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1584 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1585 CCInfo.AllocateReg(QueuePtrReg);
1586 }
1587
1588 if (Info.hasKernargSegmentPtr()) {
1589 unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1590 MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1591 CCInfo.AllocateReg(InputPtrReg);
1592 }
1593
1594 if (Info.hasDispatchID()) {
1595 unsigned DispatchIDReg = Info.addDispatchID(TRI);
1596 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1597 CCInfo.AllocateReg(DispatchIDReg);
1598 }
1599
1600 if (Info.hasFlatScratchInit()) {
1601 unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1602 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1603 CCInfo.AllocateReg(FlatScratchInitReg);
1604 }
1605
1606 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1607 // these from the dispatch pointer.
1608}
1609
1610// Allocate special input registers that are initialized per-wave.
1611static void allocateSystemSGPRs(CCState &CCInfo,
1612 MachineFunction &MF,
1613 SIMachineFunctionInfo &Info,
1614 CallingConv::ID CallConv,
1615 bool IsShader) {
1616 if (Info.hasWorkGroupIDX()) {
1617 unsigned Reg = Info.addWorkGroupIDX();
1618 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1619 CCInfo.AllocateReg(Reg);
1620 }
1621
1622 if (Info.hasWorkGroupIDY()) {
1623 unsigned Reg = Info.addWorkGroupIDY();
1624 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1625 CCInfo.AllocateReg(Reg);
1626 }
1627
1628 if (Info.hasWorkGroupIDZ()) {
1629 unsigned Reg = Info.addWorkGroupIDZ();
1630 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1631 CCInfo.AllocateReg(Reg);
1632 }
1633
1634 if (Info.hasWorkGroupInfo()) {
1635 unsigned Reg = Info.addWorkGroupInfo();
1636 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1637 CCInfo.AllocateReg(Reg);
1638 }
1639
1640 if (Info.hasPrivateSegmentWaveByteOffset()) {
1641 // Scratch wave offset passed in system SGPR.
1642 unsigned PrivateSegmentWaveByteOffsetReg;
1643
1644 if (IsShader) {
1645 PrivateSegmentWaveByteOffsetReg =
1646 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1647
1648 // This is true if the scratch wave byte offset doesn't have a fixed
1649 // location.
1650 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1651 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1652 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1653 }
1654 } else
1655 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1656
1657 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1658 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1659 }
1660}
1661
1662static void reservePrivateMemoryRegs(const TargetMachine &TM,
1663 MachineFunction &MF,
1664 const SIRegisterInfo &TRI,
1665 SIMachineFunctionInfo &Info) {
1666 // Now that we've figured out where the scratch register inputs are, see if
1667 // should reserve the arguments and use them directly.
1668 MachineFrameInfo &MFI = MF.getFrameInfo();
1669 bool HasStackObjects = MFI.hasStackObjects();
1670
1671 // Record that we know we have non-spill stack objects so we don't need to
1672 // check all stack objects later.
1673 if (HasStackObjects)
1674 Info.setHasNonSpillStackObjects(true);
1675
1676 // Everything live out of a block is spilled with fast regalloc, so it's
1677 // almost certain that spilling will be required.
1678 if (TM.getOptLevel() == CodeGenOpt::None)
1679 HasStackObjects = true;
1680
1681 // For now assume stack access is needed in any callee functions, so we need
1682 // the scratch registers to pass in.
1683 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1684
1685 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1686 if (ST.isAmdHsaOrMesa(MF.getFunction())) {
1687 if (RequiresStackAccess) {
1688 // If we have stack objects, we unquestionably need the private buffer
1689 // resource. For the Code Object V2 ABI, this will be the first 4 user
1690 // SGPR inputs. We can reserve those and use them directly.
1691
1692 unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1693 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
1694 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1695
1696 if (MFI.hasCalls()) {
1697 // If we have calls, we need to keep the frame register in a register
1698 // that won't be clobbered by a call, so ensure it is copied somewhere.
1699
1700 // This is not a problem for the scratch wave offset, because the same
1701 // registers are reserved in all functions.
1702
1703 // FIXME: Nothing is really ensuring this is a call preserved register,
1704 // it's just selected from the end so it happens to be.
1705 unsigned ReservedOffsetReg
1706 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1707 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1708 } else {
1709 unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1710 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1711 Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1712 }
1713 } else {
1714 unsigned ReservedBufferReg
1715 = TRI.reservedPrivateSegmentBufferReg(MF);
1716 unsigned ReservedOffsetReg
1717 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1718
1719 // We tentatively reserve the last registers (skipping the last two
1720 // which may contain VCC). After register allocation, we'll replace
1721 // these with the ones immediately after those which were really
1722 // allocated. In the prologue copies will be inserted from the argument
1723 // to these reserved registers.
1724 Info.setScratchRSrcReg(ReservedBufferReg);
1725 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1726 }
1727 } else {
1728 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1729
1730 // Without HSA, relocations are used for the scratch pointer and the
1731 // buffer resource setup is always inserted in the prologue. Scratch wave
1732 // offset is still in an input SGPR.
1733 Info.setScratchRSrcReg(ReservedBufferReg);
1734
1735 if (HasStackObjects && !MFI.hasCalls()) {
1736 unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1737 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1738 Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1739 } else {
1740 unsigned ReservedOffsetReg
1741 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1742 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1743 }
1744 }
1745}
1746
1747bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1748 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1749 return !Info->isEntryFunction();
1750}
1751
1752void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1753
1754}
1755
1756void SITargetLowering::insertCopiesSplitCSR(
1757 MachineBasicBlock *Entry,
1758 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1759 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1760
1761 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1762 if (!IStart)
1763 return;
1764
1765 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1766 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1767 MachineBasicBlock::iterator MBBI = Entry->begin();
1768 for (const MCPhysReg *I = IStart; *I; ++I) {
1769 const TargetRegisterClass *RC = nullptr;
1770 if (AMDGPU::SReg_64RegClass.contains(*I))
1771 RC = &AMDGPU::SGPR_64RegClass;
1772 else if (AMDGPU::SReg_32RegClass.contains(*I))
1773 RC = &AMDGPU::SGPR_32RegClass;
1774 else
1775 llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1775)
;
1776
1777 unsigned NewVR = MRI->createVirtualRegister(RC);
1778 // Create copy from CSR to a virtual register.
1779 Entry->addLiveIn(*I);
1780 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1781 .addReg(*I);
1782
1783 // Insert the copy-back instructions right before the terminator.
1784 for (auto *Exit : Exits)
1785 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1786 TII->get(TargetOpcode::COPY), *I)
1787 .addReg(NewVR);
1788 }
1789}
1790
1791SDValue SITargetLowering::LowerFormalArguments(
1792 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1793 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1794 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1795 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1796
1797 MachineFunction &MF = DAG.getMachineFunction();
1798 const Function &Fn = MF.getFunction();
1799 FunctionType *FType = MF.getFunction().getFunctionType();
1800 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1801 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1802
1803 if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1804 DiagnosticInfoUnsupported NoGraphicsHSA(
1805 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1806 DAG.getContext()->diagnose(NoGraphicsHSA);
1807 return DAG.getEntryNode();
1808 }
1809
1810 // Create stack objects that are used for emitting debugger prologue if
1811 // "amdgpu-debugger-emit-prologue" attribute was specified.
1812 if (ST.debuggerEmitPrologue())
1813 createDebuggerPrologueStackObjects(MF);
1814
1815 SmallVector<ISD::InputArg, 16> Splits;
1816 SmallVector<CCValAssign, 16> ArgLocs;
1817 BitVector Skipped(Ins.size());
1818 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1819 *DAG.getContext());
1820
1821 bool IsShader = AMDGPU::isShader(CallConv);
1822 bool IsKernel = AMDGPU::isKernel(CallConv);
1823 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1824
1825 if (!IsEntryFunc) {
1826 // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1827 // this when allocating argument fixed offsets.
1828 CCInfo.AllocateStack(4, 4);
1829 }
1830
1831 if (IsShader) {
1832 processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1833
1834 // At least one interpolation mode must be enabled or else the GPU will
1835 // hang.
1836 //
1837 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1838 // set PSInputAddr, the user wants to enable some bits after the compilation
1839 // based on run-time states. Since we can't know what the final PSInputEna
1840 // will look like, so we shouldn't do anything here and the user should take
1841 // responsibility for the correct programming.
1842 //
1843 // Otherwise, the following restrictions apply:
1844 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1845 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1846 // enabled too.
1847 if (CallConv == CallingConv::AMDGPU_PS) {
1848 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1849 ((Info->getPSInputAddr() & 0xF) == 0 &&
1850 Info->isPSInputAllocated(11))) {
1851 CCInfo.AllocateReg(AMDGPU::VGPR0);
1852 CCInfo.AllocateReg(AMDGPU::VGPR1);
1853 Info->markPSInputAllocated(0);
1854 Info->markPSInputEnabled(0);
1855 }
1856 if (Subtarget->isAmdPalOS()) {
1857 // For isAmdPalOS, the user does not enable some bits after compilation
1858 // based on run-time states; the register values being generated here are
1859 // the final ones set in hardware. Therefore we need to apply the
1860 // workaround to PSInputAddr and PSInputEnable together. (The case where
1861 // a bit is set in PSInputAddr but not PSInputEnable is where the
1862 // frontend set up an input arg for a particular interpolation mode, but
1863 // nothing uses that input arg. Really we should have an earlier pass
1864 // that removes such an arg.)
1865 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1866 if ((PsInputBits & 0x7F) == 0 ||
1867 ((PsInputBits & 0xF) == 0 &&
1868 (PsInputBits >> 11 & 1)))
1869 Info->markPSInputEnabled(
1870 countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
1871 }
1872 }
1873
1874 assert(!Info->hasDispatchPtr() &&((!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr
() && !Info->hasFlatScratchInit() && !Info
->hasWorkGroupIDX() && !Info->hasWorkGroupIDY()
&& !Info->hasWorkGroupIDZ() && !Info->
hasWorkGroupInfo() && !Info->hasWorkItemIDX() &&
!Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ
()) ? static_cast<void> (0) : __assert_fail ("!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1879, __PRETTY_FUNCTION__))
1875 !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&((!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr
() && !Info->hasFlatScratchInit() && !Info
->hasWorkGroupIDX() && !Info->hasWorkGroupIDY()
&& !Info->hasWorkGroupIDZ() && !Info->
hasWorkGroupInfo() && !Info->hasWorkItemIDX() &&
!Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ
()) ? static_cast<void> (0) : __assert_fail ("!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1879, __PRETTY_FUNCTION__))
1876 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&((!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr
() && !Info->hasFlatScratchInit() && !Info
->hasWorkGroupIDX() && !Info->hasWorkGroupIDY()
&& !Info->hasWorkGroupIDZ() && !Info->
hasWorkGroupInfo() && !Info->hasWorkItemIDX() &&
!Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ
()) ? static_cast<void> (0) : __assert_fail ("!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1879, __PRETTY_FUNCTION__))
1877 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&((!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr
() && !Info->hasFlatScratchInit() && !Info
->hasWorkGroupIDX() && !Info->hasWorkGroupIDY()
&& !Info->hasWorkGroupIDZ() && !Info->
hasWorkGroupInfo() && !Info->hasWorkItemIDX() &&
!Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ
()) ? static_cast<void> (0) : __assert_fail ("!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1879, __PRETTY_FUNCTION__))
1878 !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&((!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr
() && !Info->hasFlatScratchInit() && !Info
->hasWorkGroupIDX() && !Info->hasWorkGroupIDY()
&& !Info->hasWorkGroupIDZ() && !Info->
hasWorkGroupInfo() && !Info->hasWorkItemIDX() &&
!Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ
()) ? static_cast<void> (0) : __assert_fail ("!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1879, __PRETTY_FUNCTION__))
1879 !Info->hasWorkItemIDZ())((!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr
() && !Info->hasFlatScratchInit() && !Info
->hasWorkGroupIDX() && !Info->hasWorkGroupIDY()
&& !Info->hasWorkGroupIDZ() && !Info->
hasWorkGroupInfo() && !Info->hasWorkItemIDX() &&
!Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ
()) ? static_cast<void> (0) : __assert_fail ("!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1879, __PRETTY_FUNCTION__))
;
1880 } else if (IsKernel) {
1881 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX())((Info->hasWorkGroupIDX() && Info->hasWorkItemIDX
()) ? static_cast<void> (0) : __assert_fail ("Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1881, __PRETTY_FUNCTION__))
;
1882 } else {
1883 Splits.append(Ins.begin(), Ins.end());
1884 }
1885
1886 if (IsEntryFunc) {
1887 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1888 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1889 }
1890
1891 if (IsKernel) {
1892 analyzeFormalArgumentsCompute(CCInfo, Ins);
1893 } else {
1894 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1895 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1896 }
1897
1898 SmallVector<SDValue, 16> Chains;
1899
1900 // FIXME: This is the minimum kernel argument alignment. We should improve
1901 // this to the maximum alignment of the arguments.
1902 //
1903 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
1904 // kern arg offset.
1905 const unsigned KernelArgBaseAlign = 16;
1906
1907 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
1908 const ISD::InputArg &Arg = Ins[i];
1909 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
1910 InVals.push_back(DAG.getUNDEF(Arg.VT));
1911 continue;
1912 }
1913
1914 CCValAssign &VA = ArgLocs[ArgIdx++];
1915 MVT VT = VA.getLocVT();
1916
1917 if (IsEntryFunc && VA.isMemLoc()) {
1918 VT = Ins[i].VT;
1919 EVT MemVT = VA.getLocVT();
1920
1921 const uint64_t Offset = VA.getLocMemOffset();
1922 unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
1923
1924 SDValue Arg = lowerKernargMemParameter(
1925 DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
1926 Chains.push_back(Arg.getValue(1));
1927
1928 auto *ParamTy =
1929 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
1930 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
1931 ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1932 // On SI local pointers are just offsets into LDS, so they are always
1933 // less than 16-bits. On CI and newer they could potentially be
1934 // real pointers, so we can't guarantee their size.
1935 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1936 DAG.getValueType(MVT::i16));
1937 }
1938
1939 InVals.push_back(Arg);
1940 continue;
1941 } else if (!IsEntryFunc && VA.isMemLoc()) {
1942 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1943 InVals.push_back(Val);
1944 if (!Arg.Flags.isByVal())
1945 Chains.push_back(Val.getValue(1));
1946 continue;
1947 }
1948
1949 assert(VA.isRegLoc() && "Parameter must be in a register!")((VA.isRegLoc() && "Parameter must be in a register!"
) ? static_cast<void> (0) : __assert_fail ("VA.isRegLoc() && \"Parameter must be in a register!\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1949, __PRETTY_FUNCTION__))
;
1950
1951 unsigned Reg = VA.getLocReg();
1952 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
1953 EVT ValVT = VA.getValVT();
1954
1955 Reg = MF.addLiveIn(Reg, RC);
1956 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1957
1958 if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
1959 // The return object should be reasonably addressable.
1960
1961 // FIXME: This helps when the return is a real sret. If it is a
1962 // automatically inserted sret (i.e. CanLowerReturn returns false), an
1963 // extra copy is inserted in SelectionDAGBuilder which obscures this.
1964 unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
1965 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1966 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
1967 }
1968
1969 // If this is an 8 or 16-bit value, it is really passed promoted
1970 // to 32 bits. Insert an assert[sz]ext to capture this, then
1971 // truncate to the right size.
1972 switch (VA.getLocInfo()) {
1973 case CCValAssign::Full:
1974 break;
1975 case CCValAssign::BCvt:
1976 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
1977 break;
1978 case CCValAssign::SExt:
1979 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
1980 DAG.getValueType(ValVT));
1981 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1982 break;
1983 case CCValAssign::ZExt:
1984 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1985 DAG.getValueType(ValVT));
1986 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1987 break;
1988 case CCValAssign::AExt:
1989 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1990 break;
1991 default:
1992 llvm_unreachable("Unknown loc info!")::llvm::llvm_unreachable_internal("Unknown loc info!", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 1992)
;
1993 }
1994
1995 InVals.push_back(Val);
1996 }
1997
1998 if (!IsEntryFunc) {
1999 // Special inputs come after user arguments.
2000 allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2001 }
2002
2003 // Start adding system SGPRs.
2004 if (IsEntryFunc) {
2005 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
2006 } else {
2007 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2008 CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
2009 CCInfo.AllocateReg(Info->getFrameOffsetReg());
2010 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2011 }
2012
2013 auto &ArgUsageInfo =
2014 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2015 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2016
2017 unsigned StackArgSize = CCInfo.getNextStackOffset();
2018 Info->setBytesInStackArgArea(StackArgSize);
2019
2020 return Chains.empty() ? Chain :
2021 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2022}
2023
2024// TODO: If return values can't fit in registers, we should return as many as
2025// possible in registers before passing on stack.
2026bool SITargetLowering::CanLowerReturn(
2027 CallingConv::ID CallConv,
2028 MachineFunction &MF, bool IsVarArg,
2029 const SmallVectorImpl<ISD::OutputArg> &Outs,
2030 LLVMContext &Context) const {
2031 // Replacing returns with sret/stack usage doesn't make sense for shaders.
2032 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2033 // for shaders. Vector types should be explicitly handled by CC.
2034 if (AMDGPU::isEntryFunctionCC(CallConv))
2035 return true;
2036
2037 SmallVector<CCValAssign, 16> RVLocs;
2038 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2039 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2040}
2041
2042SDValue
2043SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2044 bool isVarArg,
2045 const SmallVectorImpl<ISD::OutputArg> &Outs,
2046 const SmallVectorImpl<SDValue> &OutVals,
2047 const SDLoc &DL, SelectionDAG &DAG) const {
2048 MachineFunction &MF = DAG.getMachineFunction();
2049 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2050
2051 if (AMDGPU::isKernel(CallConv)) {
2052 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2053 OutVals, DL, DAG);
2054 }
2055
2056 bool IsShader = AMDGPU::isShader(CallConv);
2057
2058 Info->setIfReturnsVoid(Outs.empty());
2059 bool IsWaveEnd = Info->returnsVoid() && IsShader;
2060
2061 // CCValAssign - represent the assignment of the return value to a location.
2062 SmallVector<CCValAssign, 48> RVLocs;
2063 SmallVector<ISD::OutputArg, 48> Splits;
2064
2065 // CCState - Info about the registers and stack slots.
2066 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2067 *DAG.getContext());
2068
2069 // Analyze outgoing return values.
2070 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2071
2072 SDValue Flag;
2073 SmallVector<SDValue, 48> RetOps;
2074 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2075
2076 // Add return address for callable functions.
2077 if (!Info->isEntryFunction()) {
2078 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2079 SDValue ReturnAddrReg = CreateLiveInRegister(
2080 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2081
2082 // FIXME: Should be able to use a vreg here, but need a way to prevent it
2083 // from being allcoated to a CSR.
2084
2085 SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2086 MVT::i64);
2087
2088 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2089 Flag = Chain.getValue(1);
2090
2091 RetOps.push_back(PhysReturnAddrReg);
2092 }
2093
2094 // Copy the result values into the output registers.
2095 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2096 ++I, ++RealRVLocIdx) {
2097 CCValAssign &VA = RVLocs[I];
2098 assert(VA.isRegLoc() && "Can only return in registers!")((VA.isRegLoc() && "Can only return in registers!") ?
static_cast<void> (0) : __assert_fail ("VA.isRegLoc() && \"Can only return in registers!\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2098, __PRETTY_FUNCTION__))
;
2099 // TODO: Partially return in registers if return values don't fit.
2100 SDValue Arg = OutVals[RealRVLocIdx];
2101
2102 // Copied from other backends.
2103 switch (VA.getLocInfo()) {
2104 case CCValAssign::Full:
2105 break;
2106 case CCValAssign::BCvt:
2107 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2108 break;
2109 case CCValAssign::SExt:
2110 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2111 break;
2112 case CCValAssign::ZExt:
2113 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2114 break;
2115 case CCValAssign::AExt:
2116 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2117 break;
2118 default:
2119 llvm_unreachable("Unknown loc info!")::llvm::llvm_unreachable_internal("Unknown loc info!", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2119)
;
2120 }
2121
2122 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2123 Flag = Chain.getValue(1);
2124 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2125 }
2126
2127 // FIXME: Does sret work properly?
2128 if (!Info->isEntryFunction()) {
2129 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2130 const MCPhysReg *I =
2131 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2132 if (I) {
2133 for (; *I; ++I) {
2134 if (AMDGPU::SReg_64RegClass.contains(*I))
2135 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2136 else if (AMDGPU::SReg_32RegClass.contains(*I))
2137 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2138 else
2139 llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2139)
;
2140 }
2141 }
2142 }
2143
2144 // Update chain and glue.
2145 RetOps[0] = Chain;
2146 if (Flag.getNode())
2147 RetOps.push_back(Flag);
2148
2149 unsigned Opc = AMDGPUISD::ENDPGM;
2150 if (!IsWaveEnd)
2151 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2152 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2153}
2154
2155SDValue SITargetLowering::LowerCallResult(
2156 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2157 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2158 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2159 SDValue ThisVal) const {
2160 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2161
2162 // Assign locations to each value returned by this call.
2163 SmallVector<CCValAssign, 16> RVLocs;
2164 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2165 *DAG.getContext());
2166 CCInfo.AnalyzeCallResult(Ins, RetCC);
2167
2168 // Copy all of the result registers out of their specified physreg.
2169 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2170 CCValAssign VA = RVLocs[i];
2171 SDValue Val;
2172
2173 if (VA.isRegLoc()) {
2174 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2175 Chain = Val.getValue(1);
2176 InFlag = Val.getValue(2);
2177 } else if (VA.isMemLoc()) {
2178 report_fatal_error("TODO: return values in memory");
2179 } else
2180 llvm_unreachable("unknown argument location type")::llvm::llvm_unreachable_internal("unknown argument location type"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2180)
;
2181
2182 switch (VA.getLocInfo()) {
2183 case CCValAssign::Full:
2184 break;
2185 case CCValAssign::BCvt:
2186 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2187 break;
2188 case CCValAssign::ZExt:
2189 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2190 DAG.getValueType(VA.getValVT()));
2191 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2192 break;
2193 case CCValAssign::SExt:
2194 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2195 DAG.getValueType(VA.getValVT()));
2196 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2197 break;
2198 case CCValAssign::AExt:
2199 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2200 break;
2201 default:
2202 llvm_unreachable("Unknown loc info!")::llvm::llvm_unreachable_internal("Unknown loc info!", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2202)
;
2203 }
2204
2205 InVals.push_back(Val);
2206 }
2207
2208 return Chain;
2209}
2210
2211// Add code to pass special inputs required depending on used features separate
2212// from the explicit user arguments present in the IR.
//
// For each preloaded value listed in InputRegs that the callee's argument info
// reports as used, the caller's copy of that value is loaded (or, when the
// caller has no incoming copy, recomputed through getImplicitArgPtr) and
// forwarded to the location the callee expects:
//   - register destinations are appended to \p RegsToPass;
//   - stack destinations get a slot allocated from \p CCInfo and the resulting
//     store node is appended to \p MemOpChains.
// \p Chain is the chain handed to storeStackInputValue for those stores.
// \p Info supplies the caller's own argument info.
2213void SITargetLowering::passSpecialInputs(
2214 CallLoweringInfo &CLI,
2215 CCState &CCInfo,
2216 const SIMachineFunctionInfo &Info,
2217 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2218 SmallVectorImpl<SDValue> &MemOpChains,
2219 SDValue Chain) const {
2220 // If we don't have a call site, this was a call inserted by
2221 // legalization. These can never use special inputs.
2222 if (!CLI.CS)
2223 return;
2224
2225 const Function *CalleeFunc = CLI.CS.getCalledFunction();
2226 assert(CalleeFunc)((CalleeFunc) ? static_cast<void> (0) : __assert_fail (
"CalleeFunc", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2226, __PRETTY_FUNCTION__))
;
2227
2228 SelectionDAG &DAG = CLI.DAG;
2229 const SDLoc &DL = CLI.DL;
2230
2231 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2232
// Callee side: which special inputs the called function expects, and where.
2233 auto &ArgUsageInfo =
2234 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2235 const AMDGPUFunctionArgInfo &CalleeArgInfo
2236 = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2237
// Caller side: where the current function received the same inputs.
2238 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2239
2240 // TODO: Unify with private memory register handling. This is complicated by
2241 // the fact that at least in kernels, the input argument is not necessarily
2242 // in the same location as the input.
2243 AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
2244 AMDGPUFunctionArgInfo::DISPATCH_PTR,
2245 AMDGPUFunctionArgInfo::QUEUE_PTR,
2246 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
2247 AMDGPUFunctionArgInfo::DISPATCH_ID,
2248 AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
2249 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
2250 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
2251 AMDGPUFunctionArgInfo::WORKITEM_ID_X,
2252 AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
2253 AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
2254 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
2255 };
2256
2257 for (auto InputID : InputRegs) {
2258 const ArgDescriptor *OutgoingArg;
2259 const TargetRegisterClass *ArgRC;
2260
// A null descriptor means the callee does not take this input; skip it.
2261 std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2262 if (!OutgoingArg)
2263 continue;
2264
2265 const ArgDescriptor *IncomingArg;
2266 const TargetRegisterClass *IncomingArgRC;
2267 std::tie(IncomingArg, IncomingArgRC)
2268 = CallerArgInfo.getPreloadedValue(InputID);
2269 assert(IncomingArgRC == ArgRC)((IncomingArgRC == ArgRC) ? static_cast<void> (0) : __assert_fail
("IncomingArgRC == ArgRC", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2269, __PRETTY_FUNCTION__))
;
2270
2271 // All special arguments are ints for now.
// An 8-byte spill size selects i64; every other size is treated as i32.
2272 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2273 SDValue InputReg;
2274
2275 if (IncomingArg) {
2276 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2277 } else {
2278 // The implicit arg ptr is special because it doesn't have a corresponding
2279 // input for kernels, and is computed from the kernarg segment pointer.
2280 assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR)((InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) ? static_cast
<void> (0) : __assert_fail ("InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2280, __PRETTY_FUNCTION__))
;
2281 InputReg = getImplicitArgPtr(DAG, DL);
2282 }
2283
2284 if (OutgoingArg->isRegister()) {
2285 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2286 } else {
// Stack-passed special input: reserve a slot (alignment argument 4) after
// whatever CCInfo has already allocated and store the value there.
2287 unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2288 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2289 SpecialArgOffset);
2290 MemOpChains.push_back(ArgStore);
2291 }
2292 }
2293}
2294
2295static bool canGuaranteeTCO(CallingConv::ID CC) {
2296 return CC == CallingConv::Fast;
2297}
2298
2299/// Return true if we might ever do TCO for calls with this calling convention.
2300static bool mayTailCallThisCC(CallingConv::ID CC) {
2301 switch (CC) {
2302 case CallingConv::C:
2303 return true;
2304 default:
2305 return canGuaranteeTCO(CC);
2306 }
2307}
2308
2309bool SITargetLowering::isEligibleForTailCallOptimization(
2310 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2311 const SmallVectorImpl<ISD::OutputArg> &Outs,
2312 const SmallVectorImpl<SDValue> &OutVals,
2313 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2314 if (!mayTailCallThisCC(CalleeCC))
2315 return false;
2316
2317 MachineFunction &MF = DAG.getMachineFunction();
2318 const Function &CallerF = MF.getFunction();
2319 CallingConv::ID CallerCC = CallerF.getCallingConv();
2320 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2321 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2322
2323 // Kernels aren't callable, and don't have a live in return address so it
2324 // doesn't make sense to do a tail call with entry functions.
2325 if (!CallerPreserved)
2326 return false;
2327
2328 bool CCMatch = CallerCC == CalleeCC;
2329
2330 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2331 if (canGuaranteeTCO(CalleeCC) && CCMatch)
2332 return true;
2333 return false;
2334 }
2335
2336 // TODO: Can we handle var args?
2337 if (IsVarArg)
2338 return false;
2339
2340 for (const Argument &Arg : CallerF.args()) {
2341 if (Arg.hasByValAttr())
2342 return false;
2343 }
2344
2345 LLVMContext &Ctx = *DAG.getContext();
2346
2347 // Check that the call results are passed in the same way.
2348 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2349 CCAssignFnForCall(CalleeCC, IsVarArg),
2350 CCAssignFnForCall(CallerCC, IsVarArg)))
2351 return false;
2352
2353 // The callee has to preserve all registers the caller needs to preserve.
2354 if (!CCMatch) {
2355 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2356 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2357 return false;
2358 }
2359
2360 // Nothing more to check if the callee is taking no arguments.
2361 if (Outs.empty())
2362 return true;
2363
2364 SmallVector<CCValAssign, 16> ArgLocs;
2365 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2366
2367 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2368
2369 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2370 // If the stack arguments for this call do not fit into our own save area then
2371 // the call cannot be made tail.
2372 // TODO: Is this really necessary?
2373 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2374 return false;
2375
2376 const MachineRegisterInfo &MRI = MF.getRegInfo();
2377 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2378}
2379
2380bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2381 if (!CI->isTailCall())
2382 return false;
2383
2384 const Function *ParentFn = CI->getParent()->getParent();
2385 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2386 return false;
2387
2388 auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2389 return (Attr.getValueAsString() != "true");
2390}
2391
// Lower the outgoing call described by \p CLI into SelectionDAG nodes.
//
// Unsupported forms (variadic callees, indirect calls, required tail calls
// under GuaranteedTailCallOpt, calls made from graphics shaders) are routed to
// lowerUnhandledCall; a call without a call-site instruction is a fatal error.
// Otherwise the arguments are assigned locations, copied to registers or
// stored to the stack, special inputs are appended via passSpecialInputs, and
// either an AMDGPUISD::TC_RETURN node (tail call) or an AMDGPUISD::CALL node
// followed by LowerCallResult is produced. \p InVals is handed on to
// lowerUnhandledCall / LowerCallResult.
//
2392// The wave scratch offset register is used as the global base pointer.
2393SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2394 SmallVectorImpl<SDValue> &InVals) const {
2395 SelectionDAG &DAG = CLI.DAG;
2396 const SDLoc &DL = CLI.DL;
2397 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2398 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2399 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2400 SDValue Chain = CLI.Chain;
2401 SDValue Callee = CLI.Callee;
// Reference, not a copy: updates below are written back into CLI.
2402 bool &IsTailCall = CLI.IsTailCall;
2403 CallingConv::ID CallConv = CLI.CallConv;
2404 bool IsVarArg = CLI.IsVarArg;
2405 bool IsSibCall = false;
// Never set to true in this function.
2406 bool IsThisReturn = false;
2407 MachineFunction &MF = DAG.getMachineFunction();
2408
2409 if (IsVarArg) {
2410 return lowerUnhandledCall(CLI, InVals,
2411 "unsupported call to variadic function ");
2412 }
2413
2414 if (!CLI.CS.getInstruction())
2415 report_fatal_error("unsupported libcall legalization");
2416
2417 if (!CLI.CS.getCalledFunction()) {
2418 return lowerUnhandledCall(CLI, InVals,
2419 "unsupported indirect call to function ");
2420 }
2421
2422 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2423 return lowerUnhandledCall(CLI, InVals,
2424 "unsupported required tail call to function ");
2425 }
2426
2427 if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
2428 // Note the issue is with the CC of the calling function, not of the call
2429 // itself.
2430 return lowerUnhandledCall(CLI, InVals,
2431 "unsupported call from graphics shader of function ");
2432 }
2433
2434 // Check whether a requested tail call can actually be performed.
2435 if (IsTailCall) {
2436 IsTailCall = isEligibleForTailCallOptimization(
2437 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2438 if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2439 report_fatal_error("failed to perform tail call elimination on a call "
2440 "site marked musttail");
2441 }
2442
2443 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2444
2445 // A sibling call is one where we're under the usual C ABI and not planning
2446 // to change that but can still do a tail call:
2447 if (!TailCallOpt && IsTailCall)
2448 IsSibCall = true;
2449
2450 if (IsTailCall)
2451 ++NumTailCalls;
2452 }
2453
2454 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2455
2456 // Analyze operands of the call, assigning locations to each operand.
2457 SmallVector<CCValAssign, 16> ArgLocs;
2458 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2459 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2460
2461 // The first 4 bytes are reserved for the callee's emergency stack slot.
2462 CCInfo.AllocateStack(4, 4);
2463
2464 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2465
2466 // Get a count of how many bytes are to be pushed on the stack.
2467 unsigned NumBytes = CCInfo.getNextStackOffset();
2468
2469 if (IsSibCall) {
2470 // Since we're not changing the ABI to make this a tail call, the memory
2471 // operands are already available in the caller's incoming argument space.
2472 NumBytes = 0;
2473 }
2474
2475 // FPDiff is the byte offset of the call's argument area from the callee's.
2476 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2477 // by this amount for a tail call. In a sibling call it must be 0 because the
2478 // caller will deallocate the entire stack and the callee still expects its
2479 // arguments to begin at SP+0. Completely unused for non-tail calls.
// Note: FPDiff is never assigned a value other than 0 in this function.
2480 int32_t FPDiff = 0;
2481 MachineFrameInfo &MFI = MF.getFrameInfo();
2482 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2483
// Left empty unless the caller is a non-entry function (see below); a
// non-empty value triggers the restore after the call.
2484 SDValue CallerSavedFP;
2485
2486 // Adjust the stack pointer for the new arguments...
2487 // These operations are automatically eliminated by the prolog/epilog pass
2488 if (!IsSibCall) {
2489 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2490
2491 unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2492
2493 // In the HSA case, this should be an identity copy.
2494 SDValue ScratchRSrcReg
2495 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2496 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2497
2498 // TODO: Don't hardcode these registers and get from the callee function.
2499 SDValue ScratchWaveOffsetReg
2500 = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2501 RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2502
2503 if (!Info->isEntryFunction()) {
2504 // Avoid clobbering this function's FP value. In the current convention
2505 // callee will overwrite this, so do save/restore around the call site.
2506 CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2507 Info->getFrameOffsetReg(), MVT::i32);
2508 }
2509 }
2510
2511 SmallVector<SDValue, 8> MemOpChains;
2512 MVT PtrVT = MVT::i32;
2513
2514 // Walk the register/memloc assignments, inserting copies/stores.
// i and realArgIdx are incremented together, so they are always equal here.
2515 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2516 ++i, ++realArgIdx) {
2517 CCValAssign &VA = ArgLocs[i];
2518 SDValue Arg = OutVals[realArgIdx];
2519
2520 // Promote the value if needed.
2521 switch (VA.getLocInfo()) {
2522 case CCValAssign::Full:
2523 break;
2524 case CCValAssign::BCvt:
2525 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2526 break;
2527 case CCValAssign::ZExt:
2528 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2529 break;
2530 case CCValAssign::SExt:
2531 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2532 break;
2533 case CCValAssign::AExt:
2534 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2535 break;
2536 case CCValAssign::FPExt:
2537 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2538 break;
2539 default:
2540 llvm_unreachable("Unknown loc info!")::llvm::llvm_unreachable_internal("Unknown loc info!", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2540)
;
2541 }
2542
2543 if (VA.isRegLoc()) {
2544 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2545 } else {
2546 assert(VA.isMemLoc())((VA.isMemLoc()) ? static_cast<void> (0) : __assert_fail
("VA.isMemLoc()", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2546, __PRETTY_FUNCTION__))
;
2547
// Stack-passed argument: work out the destination address, pointer info and
// alignment, then emit either a memcpy (byval) or a plain store.
2548 SDValue DstAddr;
2549 MachinePointerInfo DstInfo;
2550
2551 unsigned LocMemOffset = VA.getLocMemOffset();
2552 int32_t Offset = LocMemOffset;
2553
2554 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
2555 unsigned Align = 0;
2556
2557 if (IsTailCall) {
2558 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2559 unsigned OpSize = Flags.isByVal() ?
2560 Flags.getByValSize() : VA.getValVT().getStoreSize();
2561
2562 // FIXME: We can have better than the minimum byval required alignment.
2563 Align = Flags.isByVal() ? Flags.getByValAlign() :
2564 MinAlign(Subtarget->getStackAlignment(), Offset);
2565
2566 Offset = Offset + FPDiff;
2567 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2568
2569 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2570 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2571
2572 // Make sure any stack arguments overlapping with where we're storing
2573 // are loaded before this eventual operation. Otherwise they'll be
2574 // clobbered.
2575
2576 // FIXME: Why is this really necessary? This seems to just result in a
2577 // lot of code to copy the stack and write them back to the same
2578 // locations, which are supposed to be immutable?
2579 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2580 } else {
2581 DstAddr = PtrOff;
2582 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2583 Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
2584 }
2585
2586 if (Outs[i].Flags.isByVal()) {
2587 SDValue SizeNode =
2588 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2589 SDValue Cpy = DAG.getMemcpy(
2590 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2591 /*isVol = */ false, /*AlwaysInline = */ true,
2592 /*isTailCall = */ false, DstInfo,
2593 MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
2594 *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
2595
2596 MemOpChains.push_back(Cpy);
2597 } else {
2598 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
2599 MemOpChains.push_back(Store);
2600 }
2601 }
2602 }
2603
2604 // Copy special input registers after user input arguments.
2605 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
2606
// Join all argument stores/copies into a single chain before the call.
2607 if (!MemOpChains.empty())
2608 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2609
2610 // Build a sequence of copy-to-reg nodes chained together with token chain
2611 // and flag operands which copy the outgoing args into the appropriate regs.
2612 SDValue InFlag;
2613 for (auto &RegToPass : RegsToPass) {
2614 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2615 RegToPass.second, InFlag);
2616 InFlag = Chain.getValue(1);
2617 }
2618
2619
2620 SDValue PhysReturnAddrReg;
2621 if (IsTailCall) {
2622 // Since the return is being combined with the call, we need to pass on the
2623 // return address.
2624
2625 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2626 SDValue ReturnAddrReg = CreateLiveInRegister(
2627 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2628
2629 PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2630 MVT::i64);
2631 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2632 InFlag = Chain.getValue(1);
2633 }
2634
2635 // We don't usually want to end the call-sequence here because we would tidy
2636 // the frame up *after* the call, however in the ABI-changing tail-call case
2637 // we've carefully laid out the parameters so that when sp is reset they'll be
2638 // in the correct location.
2639 if (IsTailCall && !IsSibCall) {
2640 Chain = DAG.getCALLSEQ_END(Chain,
2641 DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2642 DAG.getTargetConstant(0, DL, MVT::i32),
2643 InFlag, DL);
2644 InFlag = Chain.getValue(1);
2645 }
2646
// Operand list of the call node: chain, callee, (tail-call extras), argument
// registers, register mask, and finally the glue value if there is one.
2647 std::vector<SDValue> Ops;
2648 Ops.push_back(Chain);
2649 Ops.push_back(Callee);
2650
2651 if (IsTailCall) {
2652 // Each tail call may have to adjust the stack by a different amount, so
2653 // this information must travel along with the operation for eventual
2654 // consumption by emitEpilogue.
2655 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2656
2657 Ops.push_back(PhysReturnAddrReg);
2658 }
2659
2660 // Add argument registers to the end of the list so that they are known live
2661 // into the call.
2662 for (auto &RegToPass : RegsToPass) {
2663 Ops.push_back(DAG.getRegister(RegToPass.first,
2664 RegToPass.second.getValueType()));
2665 }
2666
2667 // Add a register mask operand representing the call-preserved registers.
2668
2669 auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
2670 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2671 assert(Mask && "Missing call preserved mask for calling convention")((Mask && "Missing call preserved mask for calling convention"
) ? static_cast<void> (0) : __assert_fail ("Mask && \"Missing call preserved mask for calling convention\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2671, __PRETTY_FUNCTION__))
;
2672 Ops.push_back(DAG.getRegisterMask(Mask));
2673
2674 if (InFlag.getNode())
2675 Ops.push_back(InFlag);
2676
2677 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2678
2679 // If we're doing a tail call, use a TC_RETURN here rather than an
2680 // actual call instruction.
2681 if (IsTailCall) {
2682 MFI.setHasTailCall();
2683 return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2684 }
2685
2686 // Returns a chain and a flag for retval copy to use.
2687 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2688 Chain = Call.getValue(0);
2689 InFlag = Call.getValue(1);
2690
// Restore the frame offset register saved before the call (non-entry callers
// only; CallerSavedFP is empty otherwise).
2691 if (CallerSavedFP) {
2692 SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2693 Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2694 InFlag = Chain.getValue(1);
2695 }
2696
// NumBytes is passed as the second size operand of CALLSEQ_END; the first is 0.
2697 uint64_t CalleePopBytes = NumBytes;
2698 Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2699 DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2700 InFlag, DL);
2701 if (!Ins.empty())
2702 InFlag = Chain.getValue(1);
2703
2704 // Handle result values, copying them out of physregs into vregs that we
2705 // return.
2706 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2707 InVals, IsThisReturn,
2708 IsThisReturn ? OutVals[0] : SDValue());
2709}
2710
2711unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2712 SelectionDAG &DAG) const {
2713 unsigned Reg = StringSwitch<unsigned>(RegName)
2714 .Case("m0", AMDGPU::M0)
2715 .Case("exec", AMDGPU::EXEC)
2716 .Case("exec_lo", AMDGPU::EXEC_LO)
2717 .Case("exec_hi", AMDGPU::EXEC_HI)
2718 .Case("flat_scratch", AMDGPU::FLAT_SCR)
2719 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2720 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2721 .Default(AMDGPU::NoRegister);
2722
2723 if (Reg == AMDGPU::NoRegister) {
2724 report_fatal_error(Twine("invalid register name \""
2725 + StringRef(RegName) + "\"."));
2726
2727 }
2728
2729 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2730 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2731 report_fatal_error(Twine("invalid register \""
2732 + StringRef(RegName) + "\" for subtarget."));
2733 }
2734
2735 switch (Reg) {
2736 case AMDGPU::M0:
2737 case AMDGPU::EXEC_LO:
2738 case AMDGPU::EXEC_HI:
2739 case AMDGPU::FLAT_SCR_LO:
2740 case AMDGPU::FLAT_SCR_HI:
2741 if (VT.getSizeInBits() == 32)
2742 return Reg;
2743 break;
2744 case AMDGPU::EXEC:
2745 case AMDGPU::FLAT_SCR:
2746 if (VT.getSizeInBits() == 64)
2747 return Reg;
2748 break;
2749 default:
2750 llvm_unreachable("missing register type checking")::llvm::llvm_unreachable_internal("missing register type checking"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2750)
;
2751 }
2752
2753 report_fatal_error(Twine("invalid type for register \""
2754 + StringRef(RegName) + "\"."));
2755}
2756
2757// If kill is not the last instruction, split the block so kill is always a
2758// proper terminator.
2759MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2760 MachineBasicBlock *BB) const {
2761 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2762
2763 MachineBasicBlock::iterator SplitPoint(&MI);
2764 ++SplitPoint;
2765
2766 if (SplitPoint == BB->end()) {
2767 // Don't bother with a new block.
2768 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2769 return BB;
2770 }
2771
2772 MachineFunction *MF = BB->getParent();
2773 MachineBasicBlock *SplitBB
2774 = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2775
2776 MF->insert(++MachineFunction::iterator(BB), SplitBB);
2777 SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2778
2779 SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2780 BB->addSuccessor(SplitBB);
2781
2782 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2783 return SplitBB;
2784}
2785
2786// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2787// wavefront. If the value is uniform and just happens to be in a VGPR, this
2788// will only do one iteration. In the worst case, this will loop 64 times.
2789//
2790// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
//
// Emits the loop body into \p LoopBB (all instructions are inserted before the
// block's current first instruction):
//   - \p PhiReg is defined by a PHI of \p InitReg (from \p OrigBB) and
//     \p ResultReg (from \p LoopBB);
//   - a second PHI merges \p InitSaveExecReg (from \p OrigBB) with the exec
//     value saved inside the loop;
//   - \p Offset is added to the index read from \p IdxReg when non-zero;
//   - \p UseGPRIdxMode selects S_SET_GPR_IDX_ON instead of writing M0, with
//     \p IsIndirectSrc choosing SRC0_ENABLE vs. DST_ENABLE.
// Returns an iterator to the S_XOR_B64 that updates EXEC, i.e. the point in
// front of which the caller inserts the actual indexed move.
2791static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2792 const SIInstrInfo *TII,
2793 MachineRegisterInfo &MRI,
2794 MachineBasicBlock &OrigBB,
2795 MachineBasicBlock &LoopBB,
2796 const DebugLoc &DL,
2797 const MachineOperand &IdxReg,
2798 unsigned InitReg,
2799 unsigned ResultReg,
2800 unsigned PhiReg,
2801 unsigned InitSaveExecReg,
2802 int Offset,
2803 bool UseGPRIdxMode,
2804 bool IsIndirectSrc) {
2805 MachineBasicBlock::iterator I = LoopBB.begin();
2806
2807 unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2808 unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2809 unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2810 unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2811
2812 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2813 .addReg(InitReg)
2814 .addMBB(&OrigBB)
2815 .addReg(ResultReg)
2816 .addMBB(&LoopBB);
2817
// PhiExec is only defined here; nothing else in this function reads it.
2818 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2819 .addReg(InitSaveExecReg)
2820 .addMBB(&OrigBB)
2821 .addReg(NewExec)
2822 .addMBB(&LoopBB);
2823
2824 // Read the next variant <- also loop target.
2825 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2826 .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2827
2828 // Compare the just read M0 value to all possible Idx values.
2829 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2830 .addReg(CurrentIdxReg)
2831 .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2832
2833 // Update EXEC, save the original EXEC value to NewExec.
2834 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2835 .addReg(CondReg, RegState::Kill);
2836
2837 MRI.setSimpleHint(NewExec, CondReg);
2838
2839 if (UseGPRIdxMode) {
// NOTE: this local intentionally-or-not shadows the IdxReg parameter for the
// rest of this branch; it holds the SGPR carrying the final index.
2840 unsigned IdxReg;
2841 if (Offset == 0) {
2842 IdxReg = CurrentIdxReg;
2843 } else {
2844 IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2845 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2846 .addReg(CurrentIdxReg, RegState::Kill)
2847 .addImm(Offset);
2848 }
2849 unsigned IdxMode = IsIndirectSrc ?
2850 VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
2851 MachineInstr *SetOn =
2852 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2853 .addReg(IdxReg, RegState::Kill)
2854 .addImm(IdxMode);
// NOTE(review): operand 3 is presumably an implicit operand added by the
// instruction description (likely the M0 use) - confirm against the .td file.
2855 SetOn->getOperand(3).setIsUndef();
2856 } else {
2857 // Move index from CurrentIdxReg into M0
2858 if (Offset == 0) {
2859 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2860 .addReg(CurrentIdxReg, RegState::Kill);
2861 } else {
2862 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2863 .addReg(CurrentIdxReg, RegState::Kill)
2864 .addImm(Offset);
2865 }
2866 }
2867
2868 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2869 MachineInstr *InsertPt =
2870 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
2871 .addReg(AMDGPU::EXEC)
2872 .addReg(NewExec);
2873
2874 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2875 // s_cbranch_scc0?
2876
2877 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2878 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2879 .addMBB(&LoopBB);
2880
2881 return InsertPt->getIterator();
2882}
2883
// Build the control flow needed to index with a VGPR: saves EXEC in front of
// \p MI, splits \p MBB at \p MI into MBB -> LoopBB -> RemainderBB (LoopBB also
// branches to itself), fills LoopBB through emitLoadM0FromVGPRLoop and
// restores EXEC at the start of RemainderBB. \p MI itself ends up at the start
// of RemainderBB, after the restore is inserted in front of it.
//
// \p InitResultReg and \p PhiReg are forwarded as the loop's initial value and
// PHI result; MI's operand 0 register is used as the value produced inside the
// loop. \p Offset, \p UseGPRIdxMode and \p IsIndirectSrc are forwarded
// unchanged. Returns the insertion point inside LoopBB reported by
// emitLoadM0FromVGPRLoop.
//
2884// This has slightly sub-optimal regalloc when the source vector is killed by
2885// the read. The register allocator does not understand that the kill is
2886// per-workitem, so is kept alive for the whole loop so we end up not re-using a
2887// subregister from it, using 1 more VGPR than necessary. This was saved when
2888// this was expanded after register allocation.
2889static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
2890 MachineBasicBlock &MBB,
2891 MachineInstr &MI,
2892 unsigned InitResultReg,
2893 unsigned PhiReg,
2894 int Offset,
2895 bool UseGPRIdxMode,
2896 bool IsIndirectSrc) {
2897 MachineFunction *MF = MBB.getParent();
2898 MachineRegisterInfo &MRI = MF->getRegInfo();
2899 const DebugLoc &DL = MI.getDebugLoc();
2900 MachineBasicBlock::iterator I(&MI);
2901
2902 unsigned DstReg = MI.getOperand(0).getReg();
2903 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2904 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2905
// TmpExec only serves as the OrigBB-side input of the loop's exec PHI.
2906 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2907
2908 // Save the EXEC mask
2909 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2910 .addReg(AMDGPU::EXEC);
2911
2912 // To insert the loop we need to split the block. Move everything after this
2913 // point to a new block, and insert a new empty block between the two.
2914 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
2915 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2916 MachineFunction::iterator MBBI(MBB);
2917 ++MBBI;
2918
// Both new blocks go right after MBB, LoopBB first.
2919 MF->insert(MBBI, LoopBB);
2920 MF->insert(MBBI, RemainderBB);
2921
2922 LoopBB->addSuccessor(LoopBB);
2923 LoopBB->addSuccessor(RemainderBB);
2924
2925 // Move the rest of the block into a new block.
2926 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
2927 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2928
2929 MBB.addSuccessor(LoopBB);
2930
2931 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2932
2933 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2934 InitResultReg, DstReg, PhiReg, TmpExec,
2935 Offset, UseGPRIdxMode, IsIndirectSrc);
2936
// Restore the EXEC mask saved above once the loop has finished.
2937 MachineBasicBlock::iterator First = RemainderBB->begin();
2938 BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
2939 .addReg(SaveExec);
2940
2941 return InsPt;
2942}
2943
2944// Returns subreg index, offset
2945static std::pair<unsigned, int>
2946computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
2947 const TargetRegisterClass *SuperRC,
2948 unsigned VecReg,
2949 int Offset) {
2950 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
2951
2952 // Skip out of bounds offsets, or else we would end up using an undefined
2953 // register.
2954 if (Offset >= NumElts || Offset < 0)
2955 return std::make_pair(AMDGPU::sub0, Offset);
2956
2957 return std::make_pair(AMDGPU::sub0 + Offset, 0);
2958}
2959
2960// Return true if the index is an SGPR and was set.
2961static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
2962 MachineRegisterInfo &MRI,
2963 MachineInstr &MI,
2964 int Offset,
2965 bool UseGPRIdxMode,
2966 bool IsIndirectSrc) {
2967 MachineBasicBlock *MBB = MI.getParent();
2968 const DebugLoc &DL = MI.getDebugLoc();
2969 MachineBasicBlock::iterator I(&MI);
2970
2971 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2972 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
2973
2974 assert(Idx->getReg() != AMDGPU::NoRegister)((Idx->getReg() != AMDGPU::NoRegister) ? static_cast<void
> (0) : __assert_fail ("Idx->getReg() != AMDGPU::NoRegister"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2974, __PRETTY_FUNCTION__))
;
2975
2976 if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
2977 return false;
2978
2979 if (UseGPRIdxMode) {
2980 unsigned IdxMode = IsIndirectSrc ?
2981 VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
2982 if (Offset == 0) {
2983 MachineInstr *SetOn =
2984 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2985 .add(*Idx)
2986 .addImm(IdxMode);
2987
2988 SetOn->getOperand(3).setIsUndef();
2989 } else {
2990 unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2991 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
2992 .add(*Idx)
2993 .addImm(Offset);
2994 MachineInstr *SetOn =
2995 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2996 .addReg(Tmp, RegState::Kill)
2997 .addImm(IdxMode);
2998
2999 SetOn->getOperand(3).setIsUndef();
3000 }
3001
3002 return true;
3003 }
3004
3005 if (Offset == 0) {
3006 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3007 .add(*Idx);
3008 } else {
3009 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3010 .add(*Idx)
3011 .addImm(Offset);
3012 }
3013
3014 return true;
3015}
3016
// Expand the indirect-source pseudo \p MI: read one 32-bit element of the
// vector register named by MI's "src" operand, selected by its "idx" operand
// plus the immediate "offset" operand, into MI's operand 0 register.
//
// When the index is in an SGPR the move is emitted in place and \p MBB is
// returned. Otherwise a waterfall loop is created via loadM0FromVGPR and the
// loop block containing the move is returned. \p MI is erased in both cases.
// \p ST provides the instruction info and the VGPR-index-mode decision.
//
3017// Control flow needs to be inserted if indexing with a VGPR.
3018static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3019 MachineBasicBlock &MBB,
3020 const GCNSubtarget &ST) {
3021 const SIInstrInfo *TII = ST.getInstrInfo();
3022 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3023 MachineFunction *MF = MBB.getParent();
3024 MachineRegisterInfo &MRI = MF->getRegInfo();
3025
3026 unsigned Dst = MI.getOperand(0).getReg();
3027 unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3028 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3029
3030 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3031
// Fold an in-range constant offset into the subregister index; Offset is
// overwritten with whatever part still has to be applied at run time.
3032 unsigned SubReg;
3033 std::tie(SubReg, Offset)
3034 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3035
3036 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3037
// Fast path: index already in an SGPR, no loop required.
3038 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
3039 MachineBasicBlock::iterator I(&MI);
3040 const DebugLoc &DL = MI.getDebugLoc();
3041
3042 if (UseGPRIdxMode) {
3043 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3044 // to avoid interfering with other uses, so probably requires a new
3045 // optimization pass.
3046 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3047 .addReg(SrcReg, RegState::Undef, SubReg)
3048 .addReg(SrcReg, RegState::Implicit)
3049 .addReg(AMDGPU::M0, RegState::Implicit);
3050 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3051 } else {
3052 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3053 .addReg(SrcReg, RegState::Undef, SubReg)
3054 .addReg(SrcReg, RegState::Implicit);
3055 }
3056
3057 MI.eraseFromParent();
3058
3059 return &MBB;
3060 }
3061
// Slow path: the index is not in an SGPR, so build the loop.
3062 const DebugLoc &DL = MI.getDebugLoc();
3063 MachineBasicBlock::iterator I(&MI);
3064
3065 unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3066 unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3067
3068 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3069
3070 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3071 Offset, UseGPRIdxMode, true);
3072 MachineBasicBlock *LoopBB = InsPt->getParent();
3073
// Same move as in the fast path, but placed inside the loop at InsPt.
3074 if (UseGPRIdxMode) {
3075 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3076 .addReg(SrcReg, RegState::Undef, SubReg)
3077 .addReg(SrcReg, RegState::Implicit)
3078 .addReg(AMDGPU::M0, RegState::Implicit);
3079 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3080 } else {
3081 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3082 .addReg(SrcReg, RegState::Undef, SubReg)
3083 .addReg(SrcReg, RegState::Implicit);
3084 }
3085
3086 MI.eraseFromParent();
3087
3088 return LoopBB;
3089}
3090
3091static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3092 const TargetRegisterClass *VecRC) {
3093 switch (TRI.getRegSizeInBits(*VecRC)) {
3094 case 32: // 4 bytes
3095 return AMDGPU::V_MOVRELD_B32_V1;
3096 case 64: // 8 bytes
3097 return AMDGPU::V_MOVRELD_B32_V2;
3098 case 128: // 16 bytes
3099 return AMDGPU::V_MOVRELD_B32_V4;
3100 case 256: // 32 bytes
3101 return AMDGPU::V_MOVRELD_B32_V8;
3102 case 512: // 64 bytes
3103 return AMDGPU::V_MOVRELD_B32_V16;
3104 default:
3105 llvm_unreachable("unsupported size for MOVRELD pseudos")::llvm::llvm_unreachable_internal("unsupported size for MOVRELD pseudos"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3105)
;
3106 }
3107}
3108
3109static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3110 MachineBasicBlock &MBB,
3111 const GCNSubtarget &ST) {
3112 const SIInstrInfo *TII = ST.getInstrInfo();
3113 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3114 MachineFunction *MF = MBB.getParent();
3115 MachineRegisterInfo &MRI = MF->getRegInfo();
3116
3117 unsigned Dst = MI.getOperand(0).getReg();
3118 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3119 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3120 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3121 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3122 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3123
3124 // This can be an immediate, but will be folded later.
3125 assert(Val->getReg())((Val->getReg()) ? static_cast<void> (0) : __assert_fail
("Val->getReg()", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3125, __PRETTY_FUNCTION__))
;
3126
3127 unsigned SubReg;
3128 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3129 SrcVec->getReg(),
3130 Offset);
3131 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3132
3133 if (Idx->getReg() == AMDGPU::NoRegister) {
3134 MachineBasicBlock::iterator I(&MI);
3135 const DebugLoc &DL = MI.getDebugLoc();
3136
3137 assert(Offset == 0)((Offset == 0) ? static_cast<void> (0) : __assert_fail (
"Offset == 0", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3137, __PRETTY_FUNCTION__))
;
3138
3139 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3140 .add(*SrcVec)
3141 .add(*Val)
3142 .addImm(SubReg);
3143
3144 MI.eraseFromParent();
3145 return &MBB;
3146 }
3147
3148 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
3149 MachineBasicBlock::iterator I(&MI);
3150 const DebugLoc &DL = MI.getDebugLoc();
3151
3152 if (UseGPRIdxMode) {
3153 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3154 .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3155 .add(*Val)
3156 .addReg(Dst, RegState::ImplicitDefine)
3157 .addReg(SrcVec->getReg(), RegState::Implicit)
3158 .addReg(AMDGPU::M0, RegState::Implicit);
3159
3160 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3161 } else {
3162 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3163
3164 BuildMI(MBB, I, DL, MovRelDesc)
3165 .addReg(Dst, RegState::Define)
3166 .addReg(SrcVec->getReg())
3167 .add(*Val)
3168 .addImm(SubReg - AMDGPU::sub0);
3169 }
3170
3171 MI.eraseFromParent();
3172 return &MBB;
3173 }
3174
3175 if (Val->isReg())
3176 MRI.clearKillFlags(Val->getReg());
3177
3178 const DebugLoc &DL = MI.getDebugLoc();
3179
3180 unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3181
3182 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
3183 Offset, UseGPRIdxMode, false);
3184 MachineBasicBlock *LoopBB = InsPt->getParent();
3185
3186 if (UseGPRIdxMode) {
3187 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3188 .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3189 .add(*Val) // src0
3190 .addReg(Dst, RegState::ImplicitDefine)
3191 .addReg(PhiReg, RegState::Implicit)
3192 .addReg(AMDGPU::M0, RegState::Implicit);
3193 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3194 } else {
3195 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3196
3197 BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3198 .addReg(Dst, RegState::Define)
3199 .addReg(PhiReg)
3200 .add(*Val)
3201 .addImm(SubReg - AMDGPU::sub0);
3202 }
3203
3204 MI.eraseFromParent();
3205
3206 return LoopBB;
3207}
3208
3209MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3210 MachineInstr &MI, MachineBasicBlock *BB) const {
3211
3212 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3213 MachineFunction *MF = BB->getParent();
3214 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3215
3216 if (TII->isMIMG(MI)) {
3217 if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3218 report_fatal_error("missing mem operand from MIMG instruction");
3219 }
3220 // Add a memoperand for mimg instructions so that they aren't assumed to
3221 // be ordered memory instuctions.
3222
3223 return BB;
3224 }
3225
3226 switch (MI.getOpcode()) {
3227 case AMDGPU::S_ADD_U64_PSEUDO:
3228 case AMDGPU::S_SUB_U64_PSEUDO: {
3229 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3230 const DebugLoc &DL = MI.getDebugLoc();
3231
3232 MachineOperand &Dest = MI.getOperand(0);
3233 MachineOperand &Src0 = MI.getOperand(1);
3234 MachineOperand &Src1 = MI.getOperand(2);
3235
3236 unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3237 unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3238
3239 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3240 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3241 &AMDGPU::SReg_32_XM0RegClass);
3242 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3243 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3244 &AMDGPU::SReg_32_XM0RegClass);
3245
3246 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3247 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3248 &AMDGPU::SReg_32_XM0RegClass);
3249 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3250 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3251 &AMDGPU::SReg_32_XM0RegClass);
3252
3253 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3254
3255 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3256 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3257 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3258 .add(Src0Sub0)
3259 .add(Src1Sub0);
3260 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3261 .add(Src0Sub1)
3262 .add(Src1Sub1);
3263 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3264 .addReg(DestSub0)
3265 .addImm(AMDGPU::sub0)
3266 .addReg(DestSub1)
3267 .addImm(AMDGPU::sub1);
3268 MI.eraseFromParent();
3269 return BB;
3270 }
3271 case AMDGPU::SI_INIT_M0: {
3272 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3273 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3274 .add(MI.getOperand(0));
3275 MI.eraseFromParent();
3276 return BB;
3277 }
3278 case AMDGPU::SI_INIT_EXEC:
3279 // This should be before all vector instructions.
3280 BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3281 AMDGPU::EXEC)
3282 .addImm(MI.getOperand(0).getImm());
3283 MI.eraseFromParent();
3284 return BB;
3285
3286 case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3287 // Extract the thread count from an SGPR input and set EXEC accordingly.
3288 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3289 //
3290 // S_BFE_U32 count, input, {shift, 7}
3291 // S_BFM_B64 exec, count, 0
3292 // S_CMP_EQ_U32 count, 64
3293 // S_CMOV_B64 exec, -1
3294 MachineInstr *FirstMI = &*BB->begin();
3295 MachineRegisterInfo &MRI = MF->getRegInfo();
3296 unsigned InputReg = MI.getOperand(0).getReg();
3297 unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3298 bool Found = false;
3299
3300 // Move the COPY of the input reg to the beginning, so that we can use it.
3301 for (auto I = BB->begin(); I != &MI; I++) {
3302 if (I->getOpcode() != TargetOpcode::COPY ||
3303 I->getOperand(0).getReg() != InputReg)
3304 continue;
3305
3306 if (I == FirstMI) {
3307 FirstMI = &*++BB->begin();
3308 } else {
3309 I->removeFromParent();
3310 BB->insert(FirstMI, &*I);
3311 }
3312 Found = true;
3313 break;
3314 }
3315 assert(Found)((Found) ? static_cast<void> (0) : __assert_fail ("Found"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3315, __PRETTY_FUNCTION__))
;
3316 (void)Found;
3317
3318 // This should be before all vector instructions.
3319 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3320 .addReg(InputReg)
3321 .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3322 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3323 AMDGPU::EXEC)
3324 .addReg(CountReg)
3325 .addImm(0);
3326 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3327 .addReg(CountReg, RegState::Kill)
3328 .addImm(64);
3329 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3330 AMDGPU::EXEC)
3331 .addImm(-1);
3332 MI.eraseFromParent();
3333 return BB;
3334 }
3335
3336 case AMDGPU::GET_GROUPSTATICSIZE: {
3337 DebugLoc DL = MI.getDebugLoc();
3338 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3339 .add(MI.getOperand(0))
3340 .addImm(MFI->getLDSSize());
3341 MI.eraseFromParent();
3342 return BB;
3343 }
3344 case AMDGPU::SI_INDIRECT_SRC_V1:
3345 case AMDGPU::SI_INDIRECT_SRC_V2:
3346 case AMDGPU::SI_INDIRECT_SRC_V4:
3347 case AMDGPU::SI_INDIRECT_SRC_V8:
3348 case AMDGPU::SI_INDIRECT_SRC_V16:
3349 return emitIndirectSrc(MI, *BB, *getSubtarget());
3350 case AMDGPU::SI_INDIRECT_DST_V1:
3351 case AMDGPU::SI_INDIRECT_DST_V2:
3352 case AMDGPU::SI_INDIRECT_DST_V4:
3353 case AMDGPU::SI_INDIRECT_DST_V8:
3354 case AMDGPU::SI_INDIRECT_DST_V16:
3355 return emitIndirectDst(MI, *BB, *getSubtarget());
3356 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3357 case AMDGPU::SI_KILL_I1_PSEUDO:
3358 return splitKillBlock(MI, BB);
3359 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3360 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3361
3362 unsigned Dst = MI.getOperand(0).getReg();
3363 unsigned Src0 = MI.getOperand(1).getReg();
3364 unsigned Src1 = MI.getOperand(2).getReg();
3365 const DebugLoc &DL = MI.getDebugLoc();
3366 unsigned SrcCond = MI.getOperand(3).getReg();
3367
3368 unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3369 unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3370 unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3371
3372 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3373 .addReg(SrcCond);
3374 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3375 .addReg(Src0, 0, AMDGPU::sub0)
3376 .addReg(Src1, 0, AMDGPU::sub0)
3377 .addReg(SrcCondCopy);
3378 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3379 .addReg(Src0, 0, AMDGPU::sub1)
3380 .addReg(Src1, 0, AMDGPU::sub1)
3381 .addReg(SrcCondCopy);
3382
3383 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3384 .addReg(DstLo)
3385 .addImm(AMDGPU::sub0)
3386 .addReg(DstHi)
3387 .addImm(AMDGPU::sub1);
3388 MI.eraseFromParent();
3389 return BB;
3390 }
3391 case AMDGPU::SI_BR_UNDEF: {
3392 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3393 const DebugLoc &DL = MI.getDebugLoc();
3394 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3395 .add(MI.getOperand(0));
3396 Br->getOperand(1).setIsUndef(true); // read undef SCC
3397 MI.eraseFromParent();
3398 return BB;
3399 }
3400 case AMDGPU::ADJCALLSTACKUP:
3401 case AMDGPU::ADJCALLSTACKDOWN: {
3402 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3403 MachineInstrBuilder MIB(*MF, &MI);
3404
3405 // Add an implicit use of the frame offset reg to prevent the restore copy
3406 // inserted after the call from being reorderd after stack operations in the
3407 // the caller's frame.
3408 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
3409 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3410 .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
3411 return BB;
3412 }
3413 case AMDGPU::SI_CALL_ISEL:
3414 case AMDGPU::SI_TCRETURN_ISEL: {
3415 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3416 const DebugLoc &DL = MI.getDebugLoc();
3417 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3418
3419 MachineRegisterInfo &MRI = MF->getRegInfo();
3420 unsigned GlobalAddrReg = MI.getOperand(0).getReg();
3421 MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
3422 assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET)((PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET) ? static_cast
<void> (0) : __assert_fail ("PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3422, __PRETTY_FUNCTION__))
;
3423
3424 const GlobalValue *G = PCRel->getOperand(1).getGlobal();
3425
3426 MachineInstrBuilder MIB;
3427 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
3428 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
3429 .add(MI.getOperand(0))
3430 .addGlobalAddress(G);
3431 } else {
3432 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
3433 .add(MI.getOperand(0))
3434 .addGlobalAddress(G);
3435
3436 // There is an additional imm operand for tcreturn, but it should be in the
3437 // right place already.
3438 }
3439
3440 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3441 MIB.add(MI.getOperand(I));
3442
3443 MIB.cloneMemRefs(MI);
3444 MI.eraseFromParent();
3445 return BB;
3446 }
3447 default:
3448 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
3449 }
3450}
3451
3452bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3453 return isTypeLegal(VT.getScalarType());
3454}
3455
3456bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3457 // This currently forces unfolding various combinations of fsub into fma with
3458 // free fneg'd operands. As long as we have fast FMA (controlled by
3459 // isFMAFasterThanFMulAndFAdd), we should perform these.
3460
3461 // When fma is quarter rate, for f64 where add / sub are at best half rate,
3462 // most of these combines appear to be cycle neutral but save on instruction
3463 // count / code size.
3464 return true;
3465}
3466
3467EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3468 EVT VT) const {
3469 if (!VT.isVector()) {
3470 return MVT::i1;
3471 }
3472 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3473}
3474
3475MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3476 // TODO: Should i16 be used always if legal? For now it would force VALU
3477 // shifts.
3478 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3479}
3480
3481// Answering this is somewhat tricky and depends on the specific device which
3482// have different rates for fma or all f64 operations.
3483//
3484// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3485// regardless of which device (although the number of cycles differs between
3486// devices), so it is always profitable for f64.
3487//
3488// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3489// only on full rate devices. Normally, we should prefer selecting v_mad_f32
3490// which we can always do even without fused FP ops since it returns the same
3491// result as the separate operations and since it is always full
3492// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3493// however does not support denormals, so we do report fma as faster if we have
3494// a fast fma device and require denormals.
3495//
3496bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3497 VT = VT.getScalarType();
3498
3499 switch (VT.getSimpleVT().SimpleTy) {
3500 case MVT::f32: {
3501 // This is as fast on some subtargets. However, we always have full rate f32
3502 // mad available which returns the same result as the separate operations
3503 // which we should prefer over fma. We can't use this if we want to support
3504 // denormals, so only report this in these cases.
3505 if (Subtarget->hasFP32Denormals())
3506 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3507
3508 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3509 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3510 }
3511 case MVT::f64:
3512 return true;
3513 case MVT::f16:
3514 return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3515 default:
3516 break;
3517 }
3518
3519 return false;
3520}
3521
3522//===----------------------------------------------------------------------===//
3523// Custom DAG Lowering Operations
3524//===----------------------------------------------------------------------===//
3525
3526// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3527// wider vector type is legal.
3528SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
3529 SelectionDAG &DAG) const {
3530 unsigned Opc = Op.getOpcode();
3531 EVT VT = Op.getValueType();
3532 assert(VT == MVT::v4f16)((VT == MVT::v4f16) ? static_cast<void> (0) : __assert_fail
("VT == MVT::v4f16", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3532, __PRETTY_FUNCTION__))
;
3533
3534 SDValue Lo, Hi;
3535 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3536
3537 SDLoc SL(Op);
3538 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3539 Op->getFlags());
3540 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3541 Op->getFlags());
3542
3543 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3544}
3545
3546// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3547// wider vector type is legal.
3548SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
3549 SelectionDAG &DAG) const {
3550 unsigned Opc = Op.getOpcode();
3551 EVT VT = Op.getValueType();
3552 assert(VT == MVT::v4i16 || VT == MVT::v4f16)((VT == MVT::v4i16 || VT == MVT::v4f16) ? static_cast<void
> (0) : __assert_fail ("VT == MVT::v4i16 || VT == MVT::v4f16"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3552, __PRETTY_FUNCTION__))
;
3553
3554 SDValue Lo0, Hi0;
3555 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3556 SDValue Lo1, Hi1;
3557 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3558
3559 SDLoc SL(Op);
3560
3561 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3562 Op->getFlags());
3563 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3564 Op->getFlags());
3565
3566 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3567}
3568
3569SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3570 switch (Op.getOpcode()) {
3571 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3572 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3573 case ISD::LOAD: {
3574 SDValue Result = LowerLOAD(Op, DAG);
3575 assert((!Result.getNode() ||(((!Result.getNode() || Result.getNode()->getNumValues() ==
2) && "Load should return a value and a chain") ? static_cast
<void> (0) : __assert_fail ("(!Result.getNode() || Result.getNode()->getNumValues() == 2) && \"Load should return a value and a chain\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3577, __PRETTY_FUNCTION__))
3576 Result.getNode()->getNumValues() == 2) &&(((!Result.getNode() || Result.getNode()->getNumValues() ==
2) && "Load should return a value and a chain") ? static_cast
<void> (0) : __assert_fail ("(!Result.getNode() || Result.getNode()->getNumValues() == 2) && \"Load should return a value and a chain\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3577, __PRETTY_FUNCTION__))
3577 "Load should return a value and a chain")(((!Result.getNode() || Result.getNode()->getNumValues() ==
2) && "Load should return a value and a chain") ? static_cast
<void> (0) : __assert_fail ("(!Result.getNode() || Result.getNode()->getNumValues() == 2) && \"Load should return a value and a chain\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3577, __PRETTY_FUNCTION__))
;
3578 return Result;
3579 }
3580
3581 case ISD::FSIN:
3582 case ISD::FCOS:
3583 return LowerTrig(Op, DAG);
3584 case ISD::SELECT: return LowerSELECT(Op, DAG);
3585 case ISD::FDIV: return LowerFDIV(Op, DAG);
3586 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3587 case ISD::STORE: return LowerSTORE(Op, DAG);
3588 case ISD::GlobalAddress: {
3589 MachineFunction &MF = DAG.getMachineFunction();
3590 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3591 return LowerGlobalAddress(MFI, Op, DAG);
3592 }
3593 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3594 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3595 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3596 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3597 case ISD::INSERT_VECTOR_ELT:
3598 return lowerINSERT_VECTOR_ELT(Op, DAG);
3599 case ISD::EXTRACT_VECTOR_ELT:
3600 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3601 case ISD::BUILD_VECTOR:
3602 return lowerBUILD_VECTOR(Op, DAG);
3603 case ISD::FP_ROUND:
3604 return lowerFP_ROUND(Op, DAG);
3605 case ISD::TRAP:
3606 return lowerTRAP(Op, DAG);
3607 case ISD::DEBUGTRAP:
3608 return lowerDEBUGTRAP(Op, DAG);
3609 case ISD::FABS:
3610 case ISD::FNEG:
3611 case ISD::FCANONICALIZE:
3612 return splitUnaryVectorOp(Op, DAG);
3613 case ISD::FMINNUM:
3614 case ISD::FMAXNUM:
3615 return lowerFMINNUM_FMAXNUM(Op, DAG);
3616 case ISD::SHL:
3617 case ISD::SRA:
3618 case ISD::SRL:
3619 case ISD::ADD:
3620 case ISD::SUB:
3621 case ISD::MUL:
3622 case ISD::SMIN:
3623 case ISD::SMAX:
3624 case ISD::UMIN:
3625 case ISD::UMAX:
3626 case ISD::FADD:
3627 case ISD::FMUL:
3628 case ISD::FMINNUM_IEEE:
3629 case ISD::FMAXNUM_IEEE:
3630 return splitBinaryVectorOp(Op, DAG);
3631 }
3632 return SDValue();
3633}
3634
3635static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
3636 const SDLoc &DL,
3637 SelectionDAG &DAG, bool Unpacked) {
3638 if (!LoadVT.isVector())
3639 return Result;
3640
3641 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3642 // Truncate to v2i16/v4i16.
3643 EVT IntLoadVT = LoadVT.changeTypeToInteger();
3644
3645 // Workaround legalizer not scalarizing truncate after vector op
3646 // legalization byt not creating intermediate vector trunc.
3647 SmallVector<SDValue, 4> Elts;
3648 DAG.ExtractVectorElements(Result, Elts);
3649 for (SDValue &Elt : Elts)
3650 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3651
3652 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3653
3654 // Bitcast to original type (v2f16/v4f16).
3655 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3656 }
3657
3658 // Cast back to the original packed type.
3659 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3660}
3661
3662SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3663 MemSDNode *M,
3664 SelectionDAG &DAG,
3665 ArrayRef<SDValue> Ops,
3666 bool IsIntrinsic) const {
3667 SDLoc DL(M);
3668
3669 bool Unpacked = Subtarget->hasUnpackedD16VMem();
3670 EVT LoadVT = M->getValueType(0);
3671
3672 EVT EquivLoadVT = LoadVT;
3673 if (Unpacked && LoadVT.isVector()) {
3674 EquivLoadVT = LoadVT.isVector() ?
3675 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3676 LoadVT.getVectorNumElements()) : LoadVT;
3677 }
3678
3679 // Change from v4f16/v2f16 to EquivLoadVT.
3680 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3681
3682 SDValue Load
3683 = DAG.getMemIntrinsicNode(
3684 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3685 VTList, Ops, M->getMemoryVT(),
3686 M->getMemOperand());
3687 if (!Unpacked) // Just adjusted the opcode.
3688 return Load;
3689
3690 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
3691
3692 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
3693}
3694
3695static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
3696 SDNode *N, SelectionDAG &DAG) {
3697 EVT VT = N->getValueType(0);
3698 const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3699 if (!CD)
3700 return DAG.getUNDEF(VT);
3701
3702 int CondCode = CD->getSExtValue();
3703 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3704 CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3705 return DAG.getUNDEF(VT);
3706
3707 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3708
3709
3710 SDValue LHS = N->getOperand(1);
3711 SDValue RHS = N->getOperand(2);
3712
3713 SDLoc DL(N);
3714
3715 EVT CmpVT = LHS.getValueType();
3716 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3717 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3718 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3719 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3720 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3721 }
3722
3723 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3724
3725 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3726 DAG.getCondCode(CCOpcode));
3727}
3728
3729static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
3730 SDNode *N, SelectionDAG &DAG) {
3731 EVT VT = N->getValueType(0);
3732 const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3733 if (!CD)
3734 return DAG.getUNDEF(VT);
3735
3736 int CondCode = CD->getSExtValue();
3737 if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3738 CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3739 return DAG.getUNDEF(VT);
3740 }
3741
3742 SDValue Src0 = N->getOperand(1);
3743 SDValue Src1 = N->getOperand(2);
3744 EVT CmpVT = Src0.getValueType();
3745 SDLoc SL(N);
3746
3747 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3748 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3749 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3750 }
3751
3752 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3753 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3754 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3755 Src1, DAG.getCondCode(CCOpcode));
3756}
3757
3758void SITargetLowering::ReplaceNodeResults(SDNode *N,
3759 SmallVectorImpl<SDValue> &Results,
3760 SelectionDAG &DAG) const {
3761 switch (N->getOpcode()) {
3762 case ISD::INSERT_VECTOR_ELT: {
3763 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3764 Results.push_back(Res);
3765 return;
3766 }
3767 case ISD::EXTRACT_VECTOR_ELT: {
3768 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3769 Results.push_back(Res);
3770 return;
3771 }
3772 case ISD::INTRINSIC_WO_CHAIN: {
3773 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3774 switch (IID) {
3775 case Intrinsic::amdgcn_cvt_pkrtz: {
3776 SDValue Src0 = N->getOperand(1);
3777 SDValue Src1 = N->getOperand(2);
3778 SDLoc SL(N);
3779 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3780 Src0, Src1);
3781 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3782 return;
3783 }
3784 case Intrinsic::amdgcn_cvt_pknorm_i16:
3785 case Intrinsic::amdgcn_cvt_pknorm_u16:
3786 case Intrinsic::amdgcn_cvt_pk_i16:
3787 case Intrinsic::amdgcn_cvt_pk_u16: {
3788 SDValue Src0 = N->getOperand(1);
3789 SDValue Src1 = N->getOperand(2);
3790 SDLoc SL(N);
3791 unsigned Opcode;
3792
3793 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3794 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3795 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3796 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3797 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3798 Opcode = AMDGPUISD::CVT_PK_I16_I32;
3799 else
3800 Opcode = AMDGPUISD::CVT_PK_U16_U32;
3801
3802 EVT VT = N->getValueType(0);
3803 if (isTypeLegal(VT))
3804 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3805 else {
3806 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3807 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3808 }
3809 return;
3810 }
3811 }
3812 break;
3813 }
3814 case ISD::INTRINSIC_W_CHAIN: {
3815 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
3816 Results.push_back(Res);
3817 Results.push_back(Res.getValue(1));
3818 return;
3819 }
3820
3821 break;
3822 }
3823 case ISD::SELECT: {
3824 SDLoc SL(N);
3825 EVT VT = N->getValueType(0);
3826 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3827 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3828 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3829
3830 EVT SelectVT = NewVT;
3831 if (NewVT.bitsLT(MVT::i32)) {
3832 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3833 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3834 SelectVT = MVT::i32;
3835 }
3836
3837 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3838 N->getOperand(0), LHS, RHS);
3839
3840 if (NewVT != SelectVT)
3841 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3842 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3843 return;
3844 }
3845 case ISD::FNEG: {
3846 if (N->getValueType(0) != MVT::v2f16)
3847 break;
3848
3849 SDLoc SL(N);
3850 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3851
3852 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3853 BC,
3854 DAG.getConstant(0x80008000, SL, MVT::i32));
3855 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3856 return;
3857 }
3858 case ISD::FABS: {
3859 if (N->getValueType(0) != MVT::v2f16)
3860 break;
3861
3862 SDLoc SL(N);
3863 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3864
3865 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3866 BC,
3867 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3868 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3869 return;
3870 }
3871 default:
3872 break;
3873 }
3874}
3875
3876/// Helper function for LowerBRCOND
3877static SDNode *findUser(SDValue Value, unsigned Opcode) {
3878
3879 SDNode *Parent = Value.getNode();
3880 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3881 I != E; ++I) {
3882
3883 if (I.getUse().get() != Value)
3884 continue;
3885
3886 if (I->getOpcode() == Opcode)
3887 return *I;
3888 }
3889 return nullptr;
3890}
3891
3892unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3893 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3894 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3895 case Intrinsic::amdgcn_if:
3896 return AMDGPUISD::IF;
3897 case Intrinsic::amdgcn_else:
3898 return AMDGPUISD::ELSE;
3899 case Intrinsic::amdgcn_loop:
3900 return AMDGPUISD::LOOP;
3901 case Intrinsic::amdgcn_end_cf:
3902 llvm_unreachable("should not occur")::llvm::llvm_unreachable_internal("should not occur", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3902)
;
3903 default:
3904 return 0;
3905 }
3906 }
3907
3908 // break, if_break, else_break are all only used as inputs to loop, not
3909 // directly as branch conditions.
3910 return 0;
3911}
3912
3913void SITargetLowering::createDebuggerPrologueStackObjects(
3914 MachineFunction &MF) const {
3915 // Create stack objects that are used for emitting debugger prologue.
3916 //
3917 // Debugger prologue writes work group IDs and work item IDs to scratch memory
3918 // at fixed location in the following format:
3919 // offset 0: work group ID x
3920 // offset 4: work group ID y
3921 // offset 8: work group ID z
3922 // offset 16: work item ID x
3923 // offset 20: work item ID y
3924 // offset 24: work item ID z
3925 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3926 int ObjectIdx = 0;
3927
3928 // For each dimension:
3929 for (unsigned i = 0; i < 3; ++i) {
3930 // Create fixed stack object for work group ID.
3931 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
3932 Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
3933 // Create fixed stack object for work item ID.
3934 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
3935 Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
3936 }
3937}
3938
3939bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3940 const Triple &TT = getTargetMachine().getTargetTriple();
3941 return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3942 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3943 AMDGPU::shouldEmitConstantsToTextSection(TT);
3944}
3945
3946bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
3947 return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
3948 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3949 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3950 !shouldEmitFixup(GV) &&
3951 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
3952}
3953
3954bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
3955 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
3956}
3957
3958/// This transforms the control flow intrinsics to get the branch destination as
3959/// last parameter, also switches branch target with BR if the need arise
3960SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
3961 SelectionDAG &DAG) const {
3962 SDLoc DL(BRCOND);
3963
3964 SDNode *Intr = BRCOND.getOperand(1).getNode();
3965 SDValue Target = BRCOND.getOperand(2);
3966 SDNode *BR = nullptr;
3967 SDNode *SetCC = nullptr;
3968
3969 if (Intr->getOpcode() == ISD::SETCC) {
3970 // As long as we negate the condition everything is fine
3971 SetCC = Intr;
3972 Intr = SetCC->getOperand(0).getNode();
3973
3974 } else {
3975 // Get the target from BR if we don't negate the condition
3976 BR = findUser(BRCOND, ISD::BR);
3977 Target = BR->getOperand(1);
3978 }
3979
3980 // FIXME: This changes the types of the intrinsics instead of introducing new
3981 // nodes with the correct types.
3982 // e.g. llvm.amdgcn.loop
3983
3984 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
3985 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
3986
3987 unsigned CFNode = isCFIntrinsic(Intr);
3988 if (CFNode == 0) {
3989 // This is a uniform branch so we don't need to legalize.
3990 return BRCOND;
3991 }
3992
3993 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
3994 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
3995
3996 assert(!SetCC ||((!SetCC || (SetCC->getConstantOperandVal(1) == 1 &&
cast<CondCodeSDNode>(SetCC->getOperand(2).getNode()
)->get() == ISD::SETNE)) ? static_cast<void> (0) : __assert_fail
("!SetCC || (SetCC->getConstantOperandVal(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == ISD::SETNE)"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3999, __PRETTY_FUNCTION__))
3997 (SetCC->getConstantOperandVal(1) == 1 &&((!SetCC || (SetCC->getConstantOperandVal(1) == 1 &&
cast<CondCodeSDNode>(SetCC->getOperand(2).getNode()
)->get() == ISD::SETNE)) ? static_cast<void> (0) : __assert_fail
("!SetCC || (SetCC->getConstantOperandVal(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == ISD::SETNE)"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3999, __PRETTY_FUNCTION__))
3998 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==((!SetCC || (SetCC->getConstantOperandVal(1) == 1 &&
cast<CondCodeSDNode>(SetCC->getOperand(2).getNode()
)->get() == ISD::SETNE)) ? static_cast<void> (0) : __assert_fail
("!SetCC || (SetCC->getConstantOperandVal(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == ISD::SETNE)"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3999, __PRETTY_FUNCTION__))
3999 ISD::SETNE))((!SetCC || (SetCC->getConstantOperandVal(1) == 1 &&
cast<CondCodeSDNode>(SetCC->getOperand(2).getNode()
)->get() == ISD::SETNE)) ? static_cast<void> (0) : __assert_fail
("!SetCC || (SetCC->getConstantOperandVal(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == ISD::SETNE)"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3999, __PRETTY_FUNCTION__))
;
4000
4001 // operands of the new intrinsic call
4002 SmallVector<SDValue, 4> Ops;
4003 if (HaveChain)
4004 Ops.push_back(BRCOND.getOperand(0));
4005
4006 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
4007 Ops.push_back(Target);
4008
4009 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
4010
4011 // build the new intrinsic call
4012 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
4013
4014 if (!HaveChain) {
4015 SDValue Ops[] = {
4016 SDValue(Result, 0),
4017 BRCOND.getOperand(0)
4018 };
4019
4020 Result = DAG.getMergeValues(Ops, DL).getNode();
4021 }
4022
4023 if (BR) {
4024 // Give the branch instruction our target
4025 SDValue Ops[] = {
4026 BR->getOperand(0),
4027 BRCOND.getOperand(2)
4028 };
4029 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
4030 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
4031 BR = NewBR.getNode();
4032 }
4033
4034 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4035
4036 // Copy the intrinsic results to registers
4037 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4038 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4039 if (!CopyToReg)
4040 continue;
4041
4042 Chain = DAG.getCopyToReg(
4043 Chain, DL,
4044 CopyToReg->getOperand(1),
4045 SDValue(Result, i - 1),
4046 SDValue());
4047
4048 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4049 }
4050
4051 // Remove the old intrinsic from the chain
4052 DAG.ReplaceAllUsesOfValueWith(
4053 SDValue(Intr, Intr->getNumValues() - 1),
4054 Intr->getOperand(0));
4055
4056 return Chain;
4057}
4058
4059SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4060 SDValue Op,
4061 const SDLoc &DL,
4062 EVT VT) const {
4063 return Op.getValueType().bitsLE(VT) ?
4064 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4065 DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4066}
4067
4068SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
4069 assert(Op.getValueType() == MVT::f16 &&((Op.getValueType() == MVT::f16 && "Do not know how to custom lower FP_ROUND for non-f16 type"
) ? static_cast<void> (0) : __assert_fail ("Op.getValueType() == MVT::f16 && \"Do not know how to custom lower FP_ROUND for non-f16 type\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4070, __PRETTY_FUNCTION__))
4070 "Do not know how to custom lower FP_ROUND for non-f16 type")((Op.getValueType() == MVT::f16 && "Do not know how to custom lower FP_ROUND for non-f16 type"
) ? static_cast<void> (0) : __assert_fail ("Op.getValueType() == MVT::f16 && \"Do not know how to custom lower FP_ROUND for non-f16 type\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4070, __PRETTY_FUNCTION__))
;
4071
4072 SDValue Src = Op.getOperand(0);
4073 EVT SrcVT = Src.getValueType();
4074 if (SrcVT != MVT::f64)
4075 return Op;
4076
4077 SDLoc DL(Op);
4078
4079 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4080 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
4081 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
4082}
4083
4084SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
4085 SelectionDAG &DAG) const {
4086 EVT VT = Op.getValueType();
4087 bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
4088
4089 // FIXME: Assert during eslection that this is only selected for
4090 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4091 // mode functions, but this happens to be OK since it's only done in cases
4092 // where there is known no sNaN.
4093 if (IsIEEEMode)
4094 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
4095
4096 if (VT == MVT::v4f16)
4097 return splitBinaryVectorOp(Op, DAG);
4098 return Op;
4099}
4100
4101SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4102 SDLoc SL(Op);
4103 SDValue Chain = Op.getOperand(0);
4104
4105 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4106 !Subtarget->isTrapHandlerEnabled())
4107 return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
4108
4109 MachineFunction &MF = DAG.getMachineFunction();
4110 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4111 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4112 assert(UserSGPR != AMDGPU::NoRegister)((UserSGPR != AMDGPU::NoRegister) ? static_cast<void> (
0) : __assert_fail ("UserSGPR != AMDGPU::NoRegister", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4112, __PRETTY_FUNCTION__))
;
4113 SDValue QueuePtr = CreateLiveInRegister(
4114 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4115 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4116 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4117 QueuePtr, SDValue());
4118 SDValue Ops[] = {
4119 ToReg,
4120 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
4121 SGPR01,
4122 ToReg.getValue(1)
4123 };
4124 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4125}
4126
4127SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4128 SDLoc SL(Op);
4129 SDValue Chain = Op.getOperand(0);
4130 MachineFunction &MF = DAG.getMachineFunction();
4131
4132 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4133 !Subtarget->isTrapHandlerEnabled()) {
4134 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
4135 "debugtrap handler not supported",
4136 Op.getDebugLoc(),
4137 DS_Warning);
4138 LLVMContext &Ctx = MF.getFunction().getContext();
4139 Ctx.diagnose(NoTrap);
4140 return Chain;
4141 }
4142
4143 SDValue Ops[] = {
4144 Chain,
4145 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
4146 };
4147 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4148}
4149
4150SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
4151 SelectionDAG &DAG) const {
4152 // FIXME: Use inline constants (src_{shared, private}_base) instead.
4153 if (Subtarget->hasApertureRegs()) {
4154 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
4155 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4156 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
4157 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
4158 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4159 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4160 unsigned Encoding =
4161 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4162 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4163 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
4164
4165 SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4166 SDValue ApertureReg = SDValue(
4167 DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4168 SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4169 return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
4170 }
4171
4172 MachineFunction &MF = DAG.getMachineFunction();
4173 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4174 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4175 assert(UserSGPR != AMDGPU::NoRegister)((UserSGPR != AMDGPU::NoRegister) ? static_cast<void> (
0) : __assert_fail ("UserSGPR != AMDGPU::NoRegister", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4175, __PRETTY_FUNCTION__))
;
4176
4177 SDValue QueuePtr = CreateLiveInRegister(
4178 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4179
4180 // Offset into amd_queue_t for group_segment_aperture_base_hi /
4181 // private_segment_aperture_base_hi.
4182 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
4183
4184 SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
4185
4186 // TODO: Use custom target PseudoSourceValue.
4187 // TODO: We should use the value from the IR intrinsic call, but it might not
4188 // be available and how do we get it?
4189 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
4190 AMDGPUAS::CONSTANT_ADDRESS));
4191
4192 MachinePointerInfo PtrInfo(V, StructOffset);
4193 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
4194 MinAlign(64, StructOffset),
4195 MachineMemOperand::MODereferenceable |
4196 MachineMemOperand::MOInvariant);
4197}
4198
4199SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4200 SelectionDAG &DAG) const {
4201 SDLoc SL(Op);
4202 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4203
4204 SDValue Src = ASC->getOperand(0);
4205 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4206
4207 const AMDGPUTargetMachine &TM =
4208 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4209
4210 // flat -> local/private
4211 if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4212 unsigned DestAS = ASC->getDestAddressSpace();
4213
4214 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4215 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
4216 unsigned NullVal = TM.getNullPointerValue(DestAS);
4217 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4218 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4219 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4220
4221 return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4222 NonNull, Ptr, SegmentNullPtr);
4223 }
4224 }
4225
4226 // local/private -> flat
4227 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4228 unsigned SrcAS = ASC->getSrcAddressSpace();
4229
4230 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4231 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
4232 unsigned NullVal = TM.getNullPointerValue(SrcAS);
4233 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4234
4235 SDValue NonNull
4236 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4237
4238 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
4239 SDValue CvtPtr
4240 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4241
4242 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4243 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4244 FlatNullPtr);
4245 }
4246 }
4247
4248 // global <-> flat are no-ops and never emitted.
4249
4250 const MachineFunction &MF = DAG.getMachineFunction();
4251 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
4252 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
4253 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4254
4255 return DAG.getUNDEF(ASC->getValueType(0));
4256}
4257
4258SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4259 SelectionDAG &DAG) const {
4260 SDValue Vec = Op.getOperand(0);
4261 SDValue InsVal = Op.getOperand(1);
4262 SDValue Idx = Op.getOperand(2);
4263 EVT VecVT = Vec.getValueType();
4264 EVT EltVT = VecVT.getVectorElementType();
4265 unsigned VecSize = VecVT.getSizeInBits();
4266 unsigned EltSize = EltVT.getSizeInBits();
4267
4268
4269 assert(VecSize <= 64)((VecSize <= 64) ? static_cast<void> (0) : __assert_fail
("VecSize <= 64", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4269, __PRETTY_FUNCTION__))
;
4270
4271 unsigned NumElts = VecVT.getVectorNumElements();
4272 SDLoc SL(Op);
4273 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4274
4275 if (NumElts == 4 && EltSize == 16 && KIdx) {
4276 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4277
4278 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4279 DAG.getConstant(0, SL, MVT::i32));
4280 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4281 DAG.getConstant(1, SL, MVT::i32));
4282
4283 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4284 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4285
4286 unsigned Idx = KIdx->getZExtValue();
4287 bool InsertLo = Idx < 2;
4288 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4289 InsertLo ? LoVec : HiVec,
4290 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4291 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4292
4293 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4294
4295 SDValue Concat = InsertLo ?
4296 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4297 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4298
4299 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4300 }
4301
4302 if (isa<ConstantSDNode>(Idx))
4303 return SDValue();
4304
4305 MVT IntVT = MVT::getIntegerVT(VecSize);
4306
4307 // Avoid stack access for dynamic indexing.
4308 SDValue Val = InsVal;
4309 if (InsVal.getValueType() == MVT::f16)
4310 Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
4311
4312 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4313 SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
4314
4315 assert(isPowerOf2_32(EltSize))((isPowerOf2_32(EltSize)) ? static_cast<void> (0) : __assert_fail
("isPowerOf2_32(EltSize)", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4315, __PRETTY_FUNCTION__))
;
4316 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4317
4318 // Convert vector index to bit-index.
4319 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4320
4321 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4322 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4323 DAG.getConstant(0xffff, SL, IntVT),
4324 ScaledIdx);
4325
4326 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4327 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4328 DAG.getNOT(SL, BFM, IntVT), BCVec);
4329
4330 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4331 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
4332}
4333
4334SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4335 SelectionDAG &DAG) const {
4336 SDLoc SL(Op);
4337
4338 EVT ResultVT = Op.getValueType();
4339 SDValue Vec = Op.getOperand(0);
4340 SDValue Idx = Op.getOperand(1);
4341 EVT VecVT = Vec.getValueType();
4342 unsigned VecSize = VecVT.getSizeInBits();
4343 EVT EltVT = VecVT.getVectorElementType();
4344 assert(VecSize <= 64)((VecSize <= 64) ? static_cast<void> (0) : __assert_fail
("VecSize <= 64", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4344, __PRETTY_FUNCTION__))
;
4345
4346 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4347
4348 // Make sure we do any optimizations that will make it easier to fold
4349 // source modifiers before obscuring it with bit operations.
4350
4351 // XXX - Why doesn't this get called when vector_shuffle is expanded?
4352 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4353 return Combined;
4354
4355 unsigned EltSize = EltVT.getSizeInBits();
4356 assert(isPowerOf2_32(EltSize))((isPowerOf2_32(EltSize)) ? static_cast<void> (0) : __assert_fail
("isPowerOf2_32(EltSize)", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4356, __PRETTY_FUNCTION__))
;
4357
4358 MVT IntVT = MVT::getIntegerVT(VecSize);
4359 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4360
4361 // Convert vector index to bit-index (* EltSize)
4362 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4363
4364 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4365 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
4366
4367 if (ResultVT == MVT::f16) {
4368 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4369 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4370 }
4371
4372 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4373}
4374
4375SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4376 SelectionDAG &DAG) const {
4377 SDLoc SL(Op);
4378 EVT VT = Op.getValueType();
4379
4380 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4381 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4382
4383 // Turn into pair of packed build_vectors.
4384 // TODO: Special case for constants that can be materialized with s_mov_b64.
4385 SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4386 { Op.getOperand(0), Op.getOperand(1) });
4387 SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4388 { Op.getOperand(2), Op.getOperand(3) });
4389
4390 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4391 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4392
4393 SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4394 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4395 }
4396
4397 assert(VT == MVT::v2f16 || VT == MVT::v2i16)((VT == MVT::v2f16 || VT == MVT::v2i16) ? static_cast<void
> (0) : __assert_fail ("VT == MVT::v2f16 || VT == MVT::v2i16"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4397, __PRETTY_FUNCTION__))
;
4398 assert(!Subtarget->hasVOP3PInsts() && "this should be legal")((!Subtarget->hasVOP3PInsts() && "this should be legal"
) ? static_cast<void> (0) : __assert_fail ("!Subtarget->hasVOP3PInsts() && \"this should be legal\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4398, __PRETTY_FUNCTION__))
;
4399
4400 SDValue Lo = Op.getOperand(0);
4401 SDValue Hi = Op.getOperand(1);
4402
4403 // Avoid adding defined bits with the zero_extend.
4404 if (Hi.isUndef()) {
4405 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4406 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
4407 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
4408 }
4409
4410 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
4411 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4412
4413 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4414 DAG.getConstant(16, SL, MVT::i32));
4415 if (Lo.isUndef())
4416 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
4417
4418 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4419 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
4420
4421 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
4422 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
4423}
4424
4425bool
4426SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4427 // We can fold offsets for anything that doesn't require a GOT relocation.
4428 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4429 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4430 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4431 !shouldEmitGOTReloc(GA->getGlobal());
4432}
4433
4434static SDValue
4435buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4436 const SDLoc &DL, unsigned Offset, EVT PtrVT,
4437 unsigned GAFlags = SIInstrInfo::MO_NONE) {
4438 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4439 // lowered to the following code sequence:
4440 //
4441 // For constant address space:
4442 // s_getpc_b64 s[0:1]
4443 // s_add_u32 s0, s0, $symbol
4444 // s_addc_u32 s1, s1, 0
4445 //
4446 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4447 // a fixup or relocation is emitted to replace $symbol with a literal
4448 // constant, which is a pc-relative offset from the encoding of the $symbol
4449 // operand to the global variable.
4450 //
4451 // For global address space:
4452 // s_getpc_b64 s[0:1]
4453 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4454 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4455 //
4456 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4457 // fixups or relocations are emitted to replace $symbol@*@lo and
4458 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4459 // which is a 64-bit pc-relative offset from the encoding of the $symbol
4460 // operand to the global variable.
4461 //
4462 // What we want here is an offset from the value returned by s_getpc
4463 // (which is the address of the s_add_u32 instruction) to the global
4464 // variable, but since the encoding of $symbol starts 4 bytes after the start
4465 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4466 // small. This requires us to add 4 to the global variable offset in order to
4467 // compute the correct address.
4468 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4469 GAFlags);
4470 SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4471 GAFlags == SIInstrInfo::MO_NONE ?
4472 GAFlags : GAFlags + 1);
4473 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
4474}
4475
4476SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4477 SDValue Op,
4478 SelectionDAG &DAG) const {
4479 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
4480 const GlobalValue *GV = GSD->getGlobal();
4481 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4482 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
4483 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
4484 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4485
4486 SDLoc DL(GSD);
4487 EVT PtrVT = Op.getValueType();
4488
4489 // FIXME: Should not make address space based decisions here.
4490 if (shouldEmitFixup(GV))
4491 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
4492 else if (shouldEmitPCReloc(GV))
4493 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4494 SIInstrInfo::MO_REL32);
4495
4496 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
4497 SIInstrInfo::MO_GOTPCREL32);
4498
4499 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
4500 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
4501 const DataLayout &DataLayout = DAG.getDataLayout();
4502 unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
4503 MachinePointerInfo PtrInfo
4504 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
4505
4506 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
4507 MachineMemOperand::MODereferenceable |
4508 MachineMemOperand::MOInvariant);
4509}
4510
4511SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4512 const SDLoc &DL, SDValue V) const {
4513 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4514 // the destination register.
4515 //
4516 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4517 // so we will end up with redundant moves to m0.
4518 //
4519 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4520
4521 // A Null SDValue creates a glue result.
4522 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4523 V, Chain);
4524 return SDValue(M0, 0);
4525}
4526
4527SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4528 SDValue Op,
4529 MVT VT,
4530 unsigned Offset) const {
4531 SDLoc SL(Op);
4532 SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
4533 DAG.getEntryNode(), Offset, 4, false);
4534 // The local size values will have the hi 16-bits as zero.
4535 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4536 DAG.getValueType(VT));
4537}
4538
4539static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4540 EVT VT) {
4541 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4542 "non-hsa intrinsic with hsa target",
4543 DL.getDebugLoc());
4544 DAG.getContext()->diagnose(BadIntrin);
4545 return DAG.getUNDEF(VT);
4546}
4547
4548static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4549 EVT VT) {
4550 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4551 "intrinsic not supported on subtarget",
4552 DL.getDebugLoc());
4553 DAG.getContext()->diagnose(BadIntrin);
4554 return DAG.getUNDEF(VT);
4555}
4556
4557static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
4558 ArrayRef<SDValue> Elts) {
4559 assert(!Elts.empty())((!Elts.empty()) ? static_cast<void> (0) : __assert_fail
("!Elts.empty()", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4559, __PRETTY_FUNCTION__))
;
4560 MVT Type;
4561 unsigned NumElts;
4562
4563 if (Elts.size() == 1) {
4564 Type = MVT::f32;
4565 NumElts = 1;
4566 } else if (Elts.size() == 2) {
4567 Type = MVT::v2f32;
4568 NumElts = 2;
4569 } else if (Elts.size() <= 4) {
4570 Type = MVT::v4f32;
4571 NumElts = 4;
4572 } else if (Elts.size() <= 8) {
4573 Type = MVT::v8f32;
4574 NumElts = 8;
4575 } else {
4576 assert(Elts.size() <= 16)((Elts.size() <= 16) ? static_cast<void> (0) : __assert_fail
("Elts.size() <= 16", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4576, __PRETTY_FUNCTION__))
;
4577 Type = MVT::v16f32;
4578 NumElts = 16;
4579 }
4580
4581 SmallVector<SDValue, 16> VecElts(NumElts);
4582 for (unsigned i = 0; i < Elts.size(); ++i) {
4583 SDValue Elt = Elts[i];
4584 if (Elt.getValueType() != MVT::f32)
4585 Elt = DAG.getBitcast(MVT::f32, Elt);
4586 VecElts[i] = Elt;
4587 }
4588 for (unsigned i = Elts.size(); i < NumElts; ++i)
4589 VecElts[i] = DAG.getUNDEF(MVT::f32);
4590
4591 if (NumElts == 1)
4592 return VecElts[0];
4593 return DAG.getBuildVector(Type, DL, VecElts);
4594}
4595
4596static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
4597 SDValue *GLC, SDValue *SLC) {
4598 auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
4599 if (!CachePolicyConst)
4600 return false;
4601
4602 uint64_t Value = CachePolicyConst->getZExtValue();
4603 SDLoc DL(CachePolicy);
4604 if (GLC) {
4605 *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4606 Value &= ~(uint64_t)0x1;
4607 }
4608 if (SLC) {
4609 *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4610 Value &= ~(uint64_t)0x2;
4611 }
4612
4613 return Value == 0;
4614}
4615
4616SDValue SITargetLowering::lowerImage(SDValue Op,
4617 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4618 SelectionDAG &DAG) const {
4619 SDLoc DL(Op);
4620 MachineFunction &MF = DAG.getMachineFunction();
4621 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
4622 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4623 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4624 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
4625 const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4626 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
4627 unsigned IntrOpcode = Intr->BaseOpcode;
4628
4629 SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
4630 bool IsD16 = false;
4631 bool IsA16 = false;
4632 SDValue VData;
4633 int NumVDataDwords;
4634 unsigned AddrIdx; // Index of first address argument
4635 unsigned DMask;
4636
4637 if (BaseOpcode->Atomic) {
4638 VData = Op.getOperand(2);
4639
4640 bool Is64Bit = VData.getValueType() == MVT::i64;
4641 if (BaseOpcode->AtomicX2) {
4642 SDValue VData2 = Op.getOperand(3);
4643 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
4644 {VData, VData2});
4645 if (Is64Bit)
4646 VData = DAG.getBitcast(MVT::v4i32, VData);
4647
4648 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
4649 DMask = Is64Bit ? 0xf : 0x3;
4650 NumVDataDwords = Is64Bit ? 4 : 2;
4651 AddrIdx = 4;
4652 } else {
4653 DMask = Is64Bit ? 0x3 : 0x1;
4654 NumVDataDwords = Is64Bit ? 2 : 1;
4655 AddrIdx = 3;
4656 }
4657 } else {
4658 unsigned DMaskIdx;
4659
4660 if (BaseOpcode->Store) {
4661 VData = Op.getOperand(2);
4662
4663 MVT StoreVT = VData.getSimpleValueType();
4664 if (StoreVT.getScalarType() == MVT::f16) {
4665 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
4666 !BaseOpcode->HasD16)
4667 return Op; // D16 is unsupported for this instruction
4668
4669 IsD16 = true;
4670 VData = handleD16VData(VData, DAG);
4671 }
4672
4673 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
4674 DMaskIdx = 3;
4675 } else {
4676 MVT LoadVT = Op.getSimpleValueType();
4677 if (LoadVT.getScalarType() == MVT::f16) {
4678 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
4679 !BaseOpcode->HasD16)
4680 return Op; // D16 is unsupported for this instruction
4681
4682 IsD16 = true;
4683 if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
4684 ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
4685 }
4686
4687 NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
4688 DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
4689 }
4690
4691 auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
4692 if (!DMaskConst)
4693 return Op;
4694
4695 AddrIdx = DMaskIdx + 1;
4696 DMask = DMaskConst->getZExtValue();
4697 if (!DMask && !BaseOpcode->Store) {
4698 // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
4699 // store the channels' default values.
4700 SDValue Undef = DAG.getUNDEF(Op.getValueType());
4701 if (isa<MemSDNode>(Op))
4702 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
4703 return Undef;
4704 }
4705 }
4706
4707 unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
4708 unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
4709 unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
4710 unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
4711 NumCoords + NumLCM;
4712 unsigned NumMIVAddrs = NumVAddrs;
4713
4714 SmallVector<SDValue, 4> VAddrs;
4715
4716 // Optimize _L to _LZ when _L is zero
4717 if (LZMappingInfo) {
4718 if (auto ConstantLod =
4719 dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
4720 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4721 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
4722 NumMIVAddrs--; // remove 'lod'
4723 }
4724 }
4725 }
4726
4727 // Check for 16 bit addresses and pack if true.
4728 unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4729 MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
4730 if (VAddrVT.getScalarType() == MVT::f16 &&
4731 ST->hasFeature(AMDGPU::FeatureR128A16)) {
4732 IsA16 = true;
4733 for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
4734 SDValue AddrLo, AddrHi;
4735 // Push back extra arguments.
4736 if (i < DimIdx) {
4737 AddrLo = Op.getOperand(i);
4738 } else {
4739 AddrLo = Op.getOperand(i);
4740 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
4741 // in 1D, derivatives dx/dh and dx/dv are packed with undef.
4742 if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
4743 ((NumGradients / 2) % 2 == 1 &&
4744 (i == DimIdx + (NumGradients / 2) - 1 ||
4745 i == DimIdx + NumGradients - 1))) {
4746 AddrHi = DAG.getUNDEF(MVT::f16);
4747 } else {
4748 AddrHi = Op.getOperand(i + 1);
4749 i++;
4750 }
4751 AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f16,
4752 {AddrLo, AddrHi});
4753 AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
4754 }
4755 VAddrs.push_back(AddrLo);
4756 }
4757 } else {
4758 for (unsigned i = 0; i < NumMIVAddrs; ++i)
4759 VAddrs.push_back(Op.getOperand(AddrIdx + i));
4760 }
4761
4762 SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
4763
4764 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
4765 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
4766 unsigned CtrlIdx; // Index of texfailctrl argument
4767 SDValue Unorm;
4768 if (!BaseOpcode->Sampler) {
4769 Unorm = True;
4770 CtrlIdx = AddrIdx + NumVAddrs + 1;
4771 } else {
4772 auto UnormConst =
4773 dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
4774 if (!UnormConst)
4775 return Op;
4776
4777 Unorm = UnormConst->getZExtValue() ? True : False;
4778 CtrlIdx = AddrIdx + NumVAddrs + 3;
4779 }
4780
4781 SDValue TexFail = Op.getOperand(CtrlIdx);
4782 auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
4783 if (!TexFailConst || TexFailConst->getZExtValue() != 0)
4784 return Op;
4785
4786 SDValue GLC;
4787 SDValue SLC;
4788 if (BaseOpcode->Atomic) {
4789 GLC = True; // TODO no-return optimization
4790 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
4791 return Op;
4792 } else {
4793 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
4794 return Op;
4795 }
4796
4797 SmallVector<SDValue, 14> Ops;
4798 if (BaseOpcode->Store || BaseOpcode->Atomic)
4799 Ops.push_back(VData); // vdata
4800 Ops.push_back(VAddr);
4801 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
4802 if (BaseOpcode->Sampler)
4803 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
4804 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
4805 Ops.push_back(Unorm);
4806 Ops.push_back(GLC);
4807 Ops.push_back(SLC);
4808 Ops.push_back(IsA16 && // a16 or r128
4809 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
4810 Ops.push_back(False); // tfe
4811 Ops.push_back(False); // lwe
4812 Ops.push_back(DimInfo->DA ? True : False);
4813 if (BaseOpcode->HasD16)
4814 Ops.push_back(IsD16 ? True : False);
4815 if (isa<MemSDNode>(Op))
4816 Ops.push_back(Op.getOperand(0)); // chain
4817
4818 int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
4819 int Opcode = -1;
4820
4821 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4822 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
4823 NumVDataDwords, NumVAddrDwords);
4824 if (Opcode == -1)
4825 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
4826 NumVDataDwords, NumVAddrDwords);
4827 assert(Opcode != -1)((Opcode != -1) ? static_cast<void> (0) : __assert_fail
("Opcode != -1", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4827, __PRETTY_FUNCTION__))
;
4828
4829 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
4830 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
4831 MachineMemOperand *MemRef = MemOp->getMemOperand();
4832 DAG.setNodeMemRefs(NewNode, {MemRef});
4833 }
4834
4835 if (BaseOpcode->AtomicX2) {
4836 SmallVector<SDValue, 1> Elt;
4837 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
4838 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
4839 } else if (IsD16 && !BaseOpcode->Store) {
4840 MVT LoadVT = Op.getSimpleValueType();
4841 SDValue Adjusted = adjustLoadValueTypeImpl(
4842 SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
4843 return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
4844 }
4845
4846 return SDValue(NewNode, 0);
4847}
4848
4849SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
4850 SDValue Offset, SDValue GLC,
4851 SelectionDAG &DAG) const {
4852 MachineFunction &MF = DAG.getMachineFunction();
4853 MachineMemOperand *MMO = MF.getMachineMemOperand(
4854 MachinePointerInfo(),
4855 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4856 MachineMemOperand::MOInvariant,
4857 VT.getStoreSize(), VT.getStoreSize());
4858
4859 if (!Offset->isDivergent()) {
4860 SDValue Ops[] = {
4861 Rsrc,
4862 Offset, // Offset
4863 GLC // glc
4864 };
4865 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
4866 DAG.getVTList(VT), Ops, VT, MMO);
4867 }
4868
4869 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
4870 // assume that the buffer is unswizzled.
4871 SmallVector<SDValue, 4> Loads;
4872 unsigned NumLoads = 1;
4873 MVT LoadVT = VT.getSimpleVT();
4874
4875 assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||((LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT
::v4i32 || LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32) ? static_cast
<void> (0) : __assert_fail ("LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 || LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4876, __PRETTY_FUNCTION__))
4876 LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32)((LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT
::v4i32 || LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32) ? static_cast
<void> (0) : __assert_fail ("LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 || LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 4876, __PRETTY_FUNCTION__))
;
4877
4878 if (VT == MVT::v8i32 || VT == MVT::v16i32) {
4879 NumLoads = VT == MVT::v16i32 ? 4 : 2;
4880 LoadVT = MVT::v4i32;
4881 }
4882
4883 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
4884 unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
4885 SDValue Ops[] = {
4886 DAG.getEntryNode(), // Chain
4887 Rsrc, // rsrc
4888 DAG.getConstant(0, DL, MVT::i32), // vindex
4889 {}, // voffset
4890 {}, // soffset
4891 {}, // offset
4892 DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
4893 DAG.getConstant(0, DL, MVT::i1), // idxen
4894 };
4895
4896 // Use the alignment to ensure that the required offsets will fit into the
4897 // immediate offsets.
4898 setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
4899
4900 uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
4901 for (unsigned i = 0; i < NumLoads; ++i) {
4902 Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
4903 Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
4904 Ops, LoadVT, MMO));
4905 }
4906
4907 if (VT == MVT::v8i32 || VT == MVT::v16i32)
4908 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
4909
4910 return Loads[0];
4911}
4912
4913SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
4914 SelectionDAG &DAG) const {
4915 MachineFunction &MF = DAG.getMachineFunction();
4916 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
4917
4918 EVT VT = Op.getValueType();
4919 SDLoc DL(Op);
4920 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4921
4922 // TODO: Should this propagate fast-math-flags?
4923
4924 switch (IntrinsicID) {
4925 case Intrinsic::amdgcn_implicit_buffer_ptr: {
4926 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
4927 return emitNonHSAIntrinsicError(DAG, DL, VT);
4928 return getPreloadedValue(DAG, *MFI, VT,
4929 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4930 }
4931 case Intrinsic::amdgcn_dispatch_ptr:
4932 case Intrinsic::amdgcn_queue_ptr: {
4933 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
4934 DiagnosticInfoUnsupported BadIntrin(
4935 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
4936 DL.getDebugLoc());
4937 DAG.getContext()->diagnose(BadIntrin);
4938 return DAG.getUNDEF(VT);
4939 }
4940
4941 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
4942 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
4943 return getPreloadedValue(DAG, *MFI, VT, RegID);
4944 }
4945 case Intrinsic::amdgcn_implicitarg_ptr: {
4946 if (MFI->isEntryFunction())
4947 return getImplicitArgPtr(DAG, DL);
4948 return getPreloadedValue(DAG, *MFI, VT,
4949 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4950 }
4951 case Intrinsic::amdgcn_kernarg_segment_ptr: {
4952 return getPreloadedValue(DAG, *MFI, VT,
4953 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4954 }
4955 case Intrinsic::amdgcn_dispatch_id: {
4956 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
4957 }
4958 case Intrinsic::amdgcn_rcp:
4959 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
4960 case Intrinsic::amdgcn_rsq:
4961 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
4962 case Intrinsic::amdgcn_rsq_legacy:
4963 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4964 return emitRemovedIntrinsicError(DAG, DL, VT);
4965
4966 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
4967 case Intrinsic::amdgcn_rcp_legacy:
4968 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4969 return emitRemovedIntrinsicError(DAG, DL, VT);
4970 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
4971 case Intrinsic::amdgcn_rsq_clamp: {
4972 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4973 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
4974
4975 Type *Type = VT.getTypeForEVT(*DAG.getContext());
4976 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
4977 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
4978
4979 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
4980 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
4981 DAG.getConstantFP(Max, DL, VT));
4982 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
4983 DAG.getConstantFP(Min, DL, VT));
4984 }
4985 case Intrinsic::r600_read_ngroups_x:
4986 if (Subtarget->isAmdHsaOS())
4987 return emitNonHSAIntrinsicError(DAG, DL, VT);
4988
4989 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4990 SI::KernelInputOffsets::NGROUPS_X, 4, false);
4991 case Intrinsic::r600_read_ngroups_y:
4992 if (Subtarget->isAmdHsaOS())
4993 return emitNonHSAIntrinsicError(DAG, DL, VT);
4994
4995 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4996 SI::KernelInputOffsets::NGROUPS_Y, 4, false);
4997 case Intrinsic::r600_read_ngroups_z:
4998 if (Subtarget->isAmdHsaOS())
4999 return emitNonHSAIntrinsicError(DAG, DL, VT);
5000
5001 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5002 SI::KernelInputOffsets::NGROUPS_Z, 4, false);
5003 case Intrinsic::r600_read_global_size_x:
5004 if (Subtarget->isAmdHsaOS())
5005 return emitNonHSAIntrinsicError(DAG, DL, VT);
5006
5007 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5008 SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
5009 case Intrinsic::r600_read_global_size_y:
5010 if (Subtarget->isAmdHsaOS())
5011 return emitNonHSAIntrinsicError(DAG, DL, VT);
5012
5013 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5014 SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
5015 case Intrinsic::r600_read_global_size_z:
5016 if (Subtarget->isAmdHsaOS())
5017 return emitNonHSAIntrinsicError(DAG, DL, VT);
5018
5019 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5020 SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
5021 case Intrinsic::r600_read_local_size_x:
5022 if (Subtarget->isAmdHsaOS())
5023 return emitNonHSAIntrinsicError(DAG, DL, VT);
5024
5025 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5026 SI::KernelInputOffsets::LOCAL_SIZE_X);
5027 case Intrinsic::r600_read_local_size_y:
5028 if (Subtarget->isAmdHsaOS())
5029 return emitNonHSAIntrinsicError(DAG, DL, VT);
5030
5031 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5032 SI::KernelInputOffsets::LOCAL_SIZE_Y);
5033 case Intrinsic::r600_read_local_size_z:
5034 if (Subtarget->isAmdHsaOS())
5035 return emitNonHSAIntrinsicError(DAG, DL, VT);
5036
5037 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5038 SI::KernelInputOffsets::LOCAL_SIZE_Z);
5039 case Intrinsic::amdgcn_workgroup_id_x:
5040 case Intrinsic::r600_read_tgid_x:
5041 return getPreloadedValue(DAG, *MFI, VT,
5042 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5043 case Intrinsic::amdgcn_workgroup_id_y:
5044 case Intrinsic::r600_read_tgid_y:
5045 return getPreloadedValue(DAG, *MFI, VT,
5046 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5047 case Intrinsic::amdgcn_workgroup_id_z:
5048 case Intrinsic::r600_read_tgid_z:
5049 return getPreloadedValue(DAG, *MFI, VT,
5050 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5051 case Intrinsic::amdgcn_workitem_id_x: {
5052 case Intrinsic::r600_read_tidig_x:
5053 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5054 SDLoc(DAG.getEntryNode()),
5055 MFI->getArgInfo().WorkItemIDX);
5056 }
5057 case Intrinsic::amdgcn_workitem_id_y:
5058 case Intrinsic::r600_read_tidig_y:
5059 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5060 SDLoc(DAG.getEntryNode()),
5061 MFI->getArgInfo().WorkItemIDY);
5062 case Intrinsic::amdgcn_workitem_id_z:
5063 case Intrinsic::r600_read_tidig_z:
5064 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5065 SDLoc(DAG.getEntryNode()),
5066 MFI->getArgInfo().WorkItemIDZ);
5067 case AMDGPUIntrinsic::SI_load_const: {
5068 SDValue Load =
5069 lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
5070 DAG.getTargetConstant(0, DL, MVT::i1), DAG);
5071 return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
5072 }
5073 case Intrinsic::amdgcn_s_buffer_load: {
5074 unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
5075 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
5076 DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
5077 }
5078 case Intrinsic::amdgcn_fdiv_fast:
5079 return lowerFDIV_FAST(Op, DAG);
5080 case Intrinsic::amdgcn_interp_mov: {
5081 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5082 SDValue Glue = M0.getValue(1);
5083 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
5084 Op.getOperand(2), Op.getOperand(3), Glue);
5085 }
5086 case Intrinsic::amdgcn_interp_p1: {
5087 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5088 SDValue Glue = M0.getValue(1);
5089 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
5090 Op.getOperand(2), Op.getOperand(3), Glue);
5091 }
5092 case Intrinsic::amdgcn_interp_p2: {
5093 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5094 SDValue Glue = SDValue(M0.getNode(), 1);
5095 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
5096 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
5097 Glue);
5098 }
5099 case Intrinsic::amdgcn_sin:
5100 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
5101
5102 case Intrinsic::amdgcn_cos:
5103 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
5104
5105 case Intrinsic::amdgcn_log_clamp: {
5106 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5107 return SDValue();
5108
5109 DiagnosticInfoUnsupported BadIntrin(
5110 MF.getFunction(), "intrinsic not supported on subtarget",
5111 DL.getDebugLoc());
5112 DAG.getContext()->diagnose(BadIntrin);
5113 return DAG.getUNDEF(VT);
5114 }
5115 case Intrinsic::amdgcn_ldexp:
5116 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
5117 Op.getOperand(1), Op.getOperand(2));
5118
5119 case Intrinsic::amdgcn_fract:
5120 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
5121
5122 case Intrinsic::amdgcn_class:
5123 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
5124 Op.getOperand(1), Op.getOperand(2));
5125 case Intrinsic::amdgcn_div_fmas:
5126 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
5127 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5128 Op.getOperand(4));
5129
5130 case Intrinsic::amdgcn_div_fixup:
5131 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
5132 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5133
5134 case Intrinsic::amdgcn_trig_preop:
5135 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
5136 Op.getOperand(1), Op.getOperand(2));
5137 case Intrinsic::amdgcn_div_scale: {
5138 // 3rd parameter required to be a constant.
5139 const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
5140 if (!Param)
5141 return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
5142
5143 // Translate to the operands expected by the machine instruction. The
5144 // first parameter must be the same as the first instruction.
5145 SDValue Numerator = Op.getOperand(1);
5146 SDValue Denominator = Op.getOperand(2);
5147
5148 // Note this order is opposite of the machine instruction's operations,
5149 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
5150 // intrinsic has the numerator as the first operand to match a normal
5151 // division operation.
5152
5153 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
5154
5155 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
5156 Denominator, Numerator);
5157 }
5158 case Intrinsic::amdgcn_icmp: {
5159 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
5160 }
5161 case Intrinsic::amdgcn_fcmp: {
5162 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
5163 }
5164 case Intrinsic::amdgcn_fmed3:
5165 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
5166 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5167 case Intrinsic::amdgcn_fdot2:
5168 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
5169 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5170 Op.getOperand(4));
5171 case Intrinsic::amdgcn_fmul_legacy:
5172 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
5173 Op.getOperand(1), Op.getOperand(2));
5174 case Intrinsic::amdgcn_sffbh:
5175 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
5176 case Intrinsic::amdgcn_sbfe:
5177 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
5178 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5179 case Intrinsic::amdgcn_ubfe:
5180 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
5181 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5182 case Intrinsic::amdgcn_cvt_pkrtz:
5183 case Intrinsic::amdgcn_cvt_pknorm_i16:
5184 case Intrinsic::amdgcn_cvt_pknorm_u16:
5185 case Intrinsic::amdgcn_cvt_pk_i16:
5186 case Intrinsic::amdgcn_cvt_pk_u16: {
5187 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
5188 EVT VT = Op.getValueType();
5189 unsigned Opcode;
5190
5191 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
5192 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
5193 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
5194 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5195 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
5196 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5197 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
5198 Opcode = AMDGPUISD::CVT_PK_I16_I32;
5199 else
5200 Opcode = AMDGPUISD::CVT_PK_U16_U32;
5201
5202 if (isTypeLegal(VT))
5203 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
5204
5205 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
5206 Op.getOperand(1), Op.getOperand(2));
5207 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
5208 }
5209 case Intrinsic::amdgcn_wqm: {
5210 SDValue Src = Op.getOperand(1);
5211 return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
5212 0);
5213 }
5214 case Intrinsic::amdgcn_wwm: {
5215 SDValue Src = Op.getOperand(1);
5216 return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
5217 0);
5218 }
5219 case Intrinsic::amdgcn_fmad_ftz:
5220 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
5221 Op.getOperand(2), Op.getOperand(3));
5222 default:
5223 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5224 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5225 return lowerImage(Op, ImageDimIntr, DAG);
5226
5227 return Op;
5228 }
5229}
5230
5231SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5232 SelectionDAG &DAG) const {
5233 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5234 SDLoc DL(Op);
5235
5236 switch (IntrID) {
5237 case Intrinsic::amdgcn_atomic_inc:
5238 case Intrinsic::amdgcn_atomic_dec:
5239 case Intrinsic::amdgcn_ds_fadd:
5240 case Intrinsic::amdgcn_ds_fmin:
5241 case Intrinsic::amdgcn_ds_fmax: {
5242 MemSDNode *M = cast<MemSDNode>(Op);
5243 unsigned Opc;
5244 switch (IntrID) {
5245 case Intrinsic::amdgcn_atomic_inc:
5246 Opc = AMDGPUISD::ATOMIC_INC;
5247 break;
5248 case Intrinsic::amdgcn_atomic_dec:
5249 Opc = AMDGPUISD::ATOMIC_DEC;
5250 break;
5251 case Intrinsic::amdgcn_ds_fadd:
5252 Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
5253 break;
5254 case Intrinsic::amdgcn_ds_fmin:
5255 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
5256 break;
5257 case Intrinsic::amdgcn_ds_fmax:
5258 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
5259 break;
5260 default:
5261 llvm_unreachable("Unknown intrinsic!")::llvm::llvm_unreachable_internal("Unknown intrinsic!", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5261)
;
5262 }
5263 SDValue Ops[] = {
5264 M->getOperand(0), // Chain
5265 M->getOperand(2), // Ptr
5266 M->getOperand(3) // Value
5267 };
5268
5269 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
5270 M->getMemoryVT(), M->getMemOperand());
5271 }
5272 case Intrinsic::amdgcn_buffer_load:
5273 case Intrinsic::amdgcn_buffer_load_format: {
5274 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
5275 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5276 unsigned IdxEn = 1;
5277 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5278 IdxEn = Idx->getZExtValue() != 0;
5279 SDValue Ops[] = {
5280 Op.getOperand(0), // Chain
5281 Op.getOperand(2), // rsrc
5282 Op.getOperand(3), // vindex
5283 SDValue(), // voffset -- will be set by setBufferOffsets
5284 SDValue(), // soffset -- will be set by setBufferOffsets
5285 SDValue(), // offset -- will be set by setBufferOffsets
5286 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5287 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5288 };
5289
5290 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
5291 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
5292 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5293
5294 EVT VT = Op.getValueType();
5295 EVT IntVT = VT.changeTypeToInteger();
5296 auto *M = cast<MemSDNode>(Op);
5297 EVT LoadVT = Op.getValueType();
5298
5299 if (LoadVT.getScalarType() == MVT::f16)
5300 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5301 M, DAG, Ops);
5302 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5303 M->getMemOperand());
5304 }
5305 case Intrinsic::amdgcn_raw_buffer_load:
5306 case Intrinsic::amdgcn_raw_buffer_load_format: {
5307 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5308 SDValue Ops[] = {
5309 Op.getOperand(0), // Chain
5310 Op.getOperand(2), // rsrc
5311 DAG.getConstant(0, DL, MVT::i32), // vindex
5312 Offsets.first, // voffset
5313 Op.getOperand(4), // soffset
5314 Offsets.second, // offset
5315 Op.getOperand(5), // cachepolicy
5316 DAG.getConstant(0, DL, MVT::i1), // idxen
5317 };
5318
5319 unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
5320 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5321
5322 EVT VT = Op.getValueType();
5323 EVT IntVT = VT.changeTypeToInteger();
5324 auto *M = cast<MemSDNode>(Op);
5325 EVT LoadVT = Op.getValueType();
5326
5327 if (LoadVT.getScalarType() == MVT::f16)
5328 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5329 M, DAG, Ops);
5330 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5331 M->getMemOperand());
5332 }
5333 case Intrinsic::amdgcn_struct_buffer_load:
5334 case Intrinsic::amdgcn_struct_buffer_load_format: {
5335 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5336 SDValue Ops[] = {
5337 Op.getOperand(0), // Chain
5338 Op.getOperand(2), // rsrc
5339 Op.getOperand(3), // vindex
5340 Offsets.first, // voffset
5341 Op.getOperand(5), // soffset
5342 Offsets.second, // offset
5343 Op.getOperand(6), // cachepolicy
5344 DAG.getConstant(1, DL, MVT::i1), // idxen
5345 };
5346
5347 unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
5348 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5349
5350 EVT VT = Op.getValueType();
5351 EVT IntVT = VT.changeTypeToInteger();
5352 auto *M = cast<MemSDNode>(Op);
5353 EVT LoadVT = Op.getValueType();
5354
5355 if (LoadVT.getScalarType() == MVT::f16)
5356 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5357 M, DAG, Ops);
5358 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5359 M->getMemOperand());
5360 }
5361 case Intrinsic::amdgcn_tbuffer_load: {
5362 MemSDNode *M = cast<MemSDNode>(Op);
5363 EVT LoadVT = Op.getValueType();
5364
5365 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5366 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5367 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5368 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5369 unsigned IdxEn = 1;
5370 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5371 IdxEn = Idx->getZExtValue() != 0;
5372 SDValue Ops[] = {
5373 Op.getOperand(0), // Chain
5374 Op.getOperand(2), // rsrc
5375 Op.getOperand(3), // vindex
5376 Op.getOperand(4), // voffset
5377 Op.getOperand(5), // soffset
5378 Op.getOperand(6), // offset
5379 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5380 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5381 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5382 };
5383
5384 if (LoadVT.getScalarType() == MVT::f16)
5385 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5386 M, DAG, Ops);
5387 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5388 Op->getVTList(), Ops, LoadVT,
5389 M->getMemOperand());
5390 }
5391 case Intrinsic::amdgcn_raw_tbuffer_load: {
5392 MemSDNode *M = cast<MemSDNode>(Op);
5393 EVT LoadVT = Op.getValueType();
5394 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5395
5396 SDValue Ops[] = {
5397 Op.getOperand(0), // Chain
5398 Op.getOperand(2), // rsrc
5399 DAG.getConstant(0, DL, MVT::i32), // vindex
5400 Offsets.first, // voffset
5401 Op.getOperand(4), // soffset
5402 Offsets.second, // offset
5403 Op.getOperand(5), // format
5404 Op.getOperand(6), // cachepolicy
5405 DAG.getConstant(0, DL, MVT::i1), // idxen
5406 };
5407
5408 if (LoadVT.getScalarType() == MVT::f16)
5409 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5410 M, DAG, Ops);
5411 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5412 Op->getVTList(), Ops, LoadVT,
5413 M->getMemOperand());
5414 }
5415 case Intrinsic::amdgcn_struct_tbuffer_load: {
5416 MemSDNode *M = cast<MemSDNode>(Op);
5417 EVT LoadVT = Op.getValueType();
5418 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5419
5420 SDValue Ops[] = {
5421 Op.getOperand(0), // Chain
5422 Op.getOperand(2), // rsrc
5423 Op.getOperand(3), // vindex
5424 Offsets.first, // voffset
5425 Op.getOperand(5), // soffset
5426 Offsets.second, // offset
5427 Op.getOperand(6), // format
5428 Op.getOperand(7), // cachepolicy
5429 DAG.getConstant(1, DL, MVT::i1), // idxen
5430 };
5431
5432 if (LoadVT.getScalarType() == MVT::f16)
5433 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5434 M, DAG, Ops);
5435 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5436 Op->getVTList(), Ops, LoadVT,
5437 M->getMemOperand());
5438 }
5439 case Intrinsic::amdgcn_buffer_atomic_swap:
5440 case Intrinsic::amdgcn_buffer_atomic_add:
5441 case Intrinsic::amdgcn_buffer_atomic_sub:
5442 case Intrinsic::amdgcn_buffer_atomic_smin:
5443 case Intrinsic::amdgcn_buffer_atomic_umin:
5444 case Intrinsic::amdgcn_buffer_atomic_smax:
5445 case Intrinsic::amdgcn_buffer_atomic_umax:
5446 case Intrinsic::amdgcn_buffer_atomic_and:
5447 case Intrinsic::amdgcn_buffer_atomic_or:
5448 case Intrinsic::amdgcn_buffer_atomic_xor: {
5449 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5450 unsigned IdxEn = 1;
5451 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5452 IdxEn = Idx->getZExtValue() != 0;
5453 SDValue Ops[] = {
5454 Op.getOperand(0), // Chain
5455 Op.getOperand(2), // vdata
5456 Op.getOperand(3), // rsrc
5457 Op.getOperand(4), // vindex
5458 SDValue(), // voffset -- will be set by setBufferOffsets
5459 SDValue(), // soffset -- will be set by setBufferOffsets
5460 SDValue(), // offset -- will be set by setBufferOffsets
5461 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5462 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5463 };
5464 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
5465 EVT VT = Op.getValueType();
5466
5467 auto *M = cast<MemSDNode>(Op);
5468 unsigned Opcode = 0;
5469
5470 switch (IntrID) {
5471 case Intrinsic::amdgcn_buffer_atomic_swap:
5472 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5473 break;
5474 case Intrinsic::amdgcn_buffer_atomic_add:
5475 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5476 break;
5477 case Intrinsic::amdgcn_buffer_atomic_sub:
5478 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5479 break;
5480 case Intrinsic::amdgcn_buffer_atomic_smin:
5481 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5482 break;
5483 case Intrinsic::amdgcn_buffer_atomic_umin:
5484 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5485 break;
5486 case Intrinsic::amdgcn_buffer_atomic_smax:
5487 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5488 break;
5489 case Intrinsic::amdgcn_buffer_atomic_umax:
5490 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5491 break;
5492 case Intrinsic::amdgcn_buffer_atomic_and:
5493 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5494 break;
5495 case Intrinsic::amdgcn_buffer_atomic_or:
5496 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5497 break;
5498 case Intrinsic::amdgcn_buffer_atomic_xor:
5499 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5500 break;
5501 default:
5502 llvm_unreachable("unhandled atomic opcode")::llvm::llvm_unreachable_internal("unhandled atomic opcode", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5502)
;
5503 }
5504
5505 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5506 M->getMemOperand());
5507 }
5508 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5509 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5510 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5511 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5512 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5513 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5514 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5515 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5516 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5517 case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
5518 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5519 SDValue Ops[] = {
5520 Op.getOperand(0), // Chain
5521 Op.getOperand(2), // vdata
5522 Op.getOperand(3), // rsrc
5523 DAG.getConstant(0, DL, MVT::i32), // vindex
5524 Offsets.first, // voffset
5525 Op.getOperand(5), // soffset
5526 Offsets.second, // offset
5527 Op.getOperand(6), // cachepolicy
5528 DAG.getConstant(0, DL, MVT::i1), // idxen
5529 };
5530 EVT VT = Op.getValueType();
5531
5532 auto *M = cast<MemSDNode>(Op);
5533 unsigned Opcode = 0;
5534
5535 switch (IntrID) {
5536 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5537 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5538 break;
5539 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5540 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5541 break;
5542 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5543 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5544 break;
5545 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5546 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5547 break;
5548 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5549 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5550 break;
5551 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5552 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5553 break;
5554 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5555 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5556 break;
5557 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5558 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5559 break;
5560 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5561 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5562 break;
5563 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5564 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5565 break;
5566 default:
5567 llvm_unreachable("unhandled atomic opcode")::llvm::llvm_unreachable_internal("unhandled atomic opcode", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5567)
;
5568 }
5569
5570 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5571 M->getMemOperand());
5572 }
5573 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5574 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5575 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5576 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5577 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5578 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5579 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5580 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5581 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5582 case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
5583 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5584 SDValue Ops[] = {
5585 Op.getOperand(0), // Chain
5586 Op.getOperand(2), // vdata
5587 Op.getOperand(3), // rsrc
5588 Op.getOperand(4), // vindex
5589 Offsets.first, // voffset
5590 Op.getOperand(6), // soffset
5591 Offsets.second, // offset
5592 Op.getOperand(7), // cachepolicy
5593 DAG.getConstant(1, DL, MVT::i1), // idxen
5594 };
5595 EVT VT = Op.getValueType();
5596
5597 auto *M = cast<MemSDNode>(Op);
5598 unsigned Opcode = 0;
5599
5600 switch (IntrID) {
5601 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5602 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5603 break;
5604 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5605 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5606 break;
5607 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5608 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5609 break;
5610 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5611 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5612 break;
5613 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5614 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5615 break;
5616 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5617 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5618 break;
5619 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5620 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5621 break;
5622 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5623 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5624 break;
5625 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5626 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5627 break;
5628 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5629 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5630 break;
5631 default:
5632 llvm_unreachable("unhandled atomic opcode")::llvm::llvm_unreachable_internal("unhandled atomic opcode", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5632)
;
5633 }
5634
5635 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5636 M->getMemOperand());
5637 }
5638 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
5639 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5640 unsigned IdxEn = 1;
5641 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
5642 IdxEn = Idx->getZExtValue() != 0;
5643 SDValue Ops[] = {
5644 Op.getOperand(0), // Chain
5645 Op.getOperand(2), // src
5646 Op.getOperand(3), // cmp
5647 Op.getOperand(4), // rsrc
5648 Op.getOperand(5), // vindex
5649 SDValue(), // voffset -- will be set by setBufferOffsets
5650 SDValue(), // soffset -- will be set by setBufferOffsets
5651 SDValue(), // offset -- will be set by setBufferOffsets
5652 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5653 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5654 };
5655 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
5656 EVT VT = Op.getValueType();
5657 auto *M = cast<MemSDNode>(Op);
5658
5659 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5660 Op->getVTList(), Ops, VT, M->getMemOperand());
5661 }
5662 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
5663 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5664 SDValue Ops[] = {
5665 Op.getOperand(0), // Chain
5666 Op.getOperand(2), // src
5667 Op.getOperand(3), // cmp
5668 Op.getOperand(4), // rsrc
5669 DAG.getConstant(0, DL, MVT::i32), // vindex
5670 Offsets.first, // voffset
5671 Op.getOperand(6), // soffset
5672 Offsets.second, // offset
5673 Op.getOperand(7), // cachepolicy
5674 DAG.getConstant(0, DL, MVT::i1), // idxen
5675 };
5676 EVT VT = Op.getValueType();
5677 auto *M = cast<MemSDNode>(Op);
5678
5679 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5680 Op->getVTList(), Ops, VT, M->getMemOperand());
5681 }
5682 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
5683 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
5684 SDValue Ops[] = {
5685 Op.getOperand(0), // Chain
5686 Op.getOperand(2), // src
5687 Op.getOperand(3), // cmp
5688 Op.getOperand(4), // rsrc
5689 Op.getOperand(5), // vindex
5690 Offsets.first, // voffset
5691 Op.getOperand(7), // soffset
5692 Offsets.second, // offset
5693 Op.getOperand(8), // cachepolicy
5694 DAG.getConstant(1, DL, MVT::i1), // idxen
5695 };
5696 EVT VT = Op.getValueType();
5697 auto *M = cast<MemSDNode>(Op);
5698
5699 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5700 Op->getVTList(), Ops, VT, M->getMemOperand());
5701 }
5702
5703 default:
5704 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5705 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5706 return lowerImage(Op, ImageDimIntr, DAG);
5707
5708 return SDValue();
5709 }
5710}
5711
5712SDValue SITargetLowering::handleD16VData(SDValue VData,
5713 SelectionDAG &DAG) const {
5714 EVT StoreVT = VData.getValueType();
5715
5716 // No change for f16 and legal vector D16 types.
5717 if (!StoreVT.isVector())
5718 return VData;
5719
5720 SDLoc DL(VData);
5721 assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16")(((StoreVT.getVectorNumElements() != 3) && "Handle v3f16"
) ? static_cast<void> (0) : __assert_fail ("(StoreVT.getVectorNumElements() != 3) && \"Handle v3f16\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5721, __PRETTY_FUNCTION__))
;
5722
5723 if (Subtarget->hasUnpackedD16VMem()) {
5724 // We need to unpack the packed data to store.
5725 EVT IntStoreVT = StoreVT.changeTypeToInteger();
5726 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
5727
5728 EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5729 StoreVT.getVectorNumElements());
5730 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
5731 return DAG.UnrollVectorOp(ZExt.getNode());
5732 }
5733
5734 assert(isTypeLegal(StoreVT))((isTypeLegal(StoreVT)) ? static_cast<void> (0) : __assert_fail
("isTypeLegal(StoreVT)", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5734, __PRETTY_FUNCTION__))
;
5735 return VData;
5736}
5737
// Custom lowering for intrinsic nodes that produce no value.
//
// Operand 0 of Op is the chain and operand 1 is the intrinsic ID; the
// remaining operands are the intrinsic's own arguments, whose positions differ
// per intrinsic (see the per-case comments). Each case rebuilds the call as a
// target-specific node. Intrinsics that are not listed are passed to
// lowerImage when they have an image-dim table entry, and are otherwise
// returned unchanged.
5738SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5739 SelectionDAG &DAG) const {
5740 SDLoc DL(Op);
5741 SDValue Chain = Op.getOperand(0);
5742 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5743 MachineFunction &MF = DAG.getMachineFunction();
5744
5745 switch (IntrinsicID) {
// Uncompressed export: four sources are forwarded as-is and compr is 0.
// tgt, en, done and vm must be constants (cast<> asserts otherwise).
5746 case Intrinsic::amdgcn_exp: {
5747 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
5748 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
5749 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
5750 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
5751
5752 const SDValue Ops[] = {
5753 Chain,
5754 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
5755 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
5756 Op.getOperand(4), // src0
5757 Op.getOperand(5), // src1
5758 Op.getOperand(6), // src2
5759 Op.getOperand(7), // src3
5760 DAG.getTargetConstant(0, DL, MVT::i1), // compr
5761 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
5762 };
5763
// A zero `done` operand selects EXPORT; any other value selects EXPORT_DONE.
5764 unsigned Opc = Done->isNullValue() ?
5765 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
5766 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
5767 }
// Compressed export: only two sources exist. They are bitcast to f32, src2 and
// src3 are filled with undef, and compr is 1.
5768 case Intrinsic::amdgcn_exp_compr: {
5769 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
5770 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
5771 SDValue Src0 = Op.getOperand(4);
5772 SDValue Src1 = Op.getOperand(5);
5773 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
5774 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
5775
5776 SDValue Undef = DAG.getUNDEF(MVT::f32);
5777 const SDValue Ops[] = {
5778 Chain,
5779 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
5780 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
5781 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
5782 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
5783 Undef, // src2
5784 Undef, // src3
5785 DAG.getTargetConstant(1, DL, MVT::i1), // compr
5786 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
5787 };
5788
5789 unsigned Opc = Done->isNullValue() ?
5790 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
5791 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
5792 }
// Operand 3 is first copied into M0 via copyToM0; the glue result of that copy
// ties it to the SENDMSG/SENDMSGHALT node, which takes operand 2 directly.
5793 case Intrinsic::amdgcn_s_sendmsg:
5794 case Intrinsic::amdgcn_s_sendmsghalt: {
5795 unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
5796 AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
5797 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
5798 SDValue Glue = Chain.getValue(1);
5799 return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
5800 Op.getOperand(2), Glue);
5801 }
5802 case Intrinsic::amdgcn_init_exec: {
5803 return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
5804 Op.getOperand(2));
5805 }
5806 case Intrinsic::amdgcn_init_exec_from_input: {
5807 return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
5808 Op.getOperand(2), Op.getOperand(3));
5809 }
// Legacy kill. A constant non-negative source makes the intrinsic a no-op, so
// only the chain is returned. A constant negative source becomes a KILL on the
// bit pattern of -1.0f. A non-constant source is bitcast to i32 and passed on.
5810 case AMDGPUIntrinsic::AMDGPU_kill: {
5811 SDValue Src = Op.getOperand(2);
5812 if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
5813 if (!K->isNegative())
5814 return Chain;
5815
5816 SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
5817 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
5818 }
5819
5820 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
5821 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
5822 }
// When optimizing, a barrier is replaced by a WAVE_BARRIER machine node if the
// second value of the function's flat work-group size pair (presumably the
// maximum -- confirm against getFlatWorkGroupSizes) does not exceed the
// wavefront size. In every other situation an empty SDValue is returned.
5823 case Intrinsic::amdgcn_s_barrier: {
5824 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
5825 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5826 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
5827 if (WGSize <= ST.getWavefrontSize())
5828 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
5829 Op.getOperand(0)), 0);
5830 }
5831 return SDValue();
5832 };
// Legacy typed-buffer store. It has a single vaddr operand that serves as
// either the index or the offset, selected by the constant idxen/offen flags.
5833 case AMDGPUIntrinsic::SI_tbuffer_store: {
5834
5835 // Extract vindex and voffset from vaddr as appropriate
5836 const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
5837 const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
5838 SDValue VAddr = Op.getOperand(5);
5839
5840 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
5841
5842 assert(!(OffEn->isOne() && IdxEn->isOne()) &&((!(OffEn->isOne() && IdxEn->isOne()) &&
"Legacy intrinsic doesn't support both offset and index - use new version"
) ? static_cast<void> (0) : __assert_fail ("!(OffEn->isOne() && IdxEn->isOne()) && \"Legacy intrinsic doesn't support both offset and index - use new version\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5843, __PRETTY_FUNCTION__))
5843 "Legacy intrinsic doesn't support both offset and index - use new version")((!(OffEn->isOne() && IdxEn->isOne()) &&
"Legacy intrinsic doesn't support both offset and index - use new version"
) ? static_cast<void> (0) : __assert_fail ("!(OffEn->isOne() && IdxEn->isOne()) && \"Legacy intrinsic doesn't support both offset and index - use new version\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5843, __PRETTY_FUNCTION__))
;
5844
5845 SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
5846 SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
5847
5848 // Deal with the vec-3 case
5849 const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
5850 auto Opcode = NumChannels->getZExtValue() == 3 ?
5851 AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
5852
// dfmt/nfmt are packed into one format immediate (nfmt shifted left by 4) and
// glc/slc into one cachepolicy immediate (slc shifted left by 1).
5853 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5854 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5855 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(12))->getZExtValue();
5856 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(13))->getZExtValue();
5857 SDValue Ops[] = {
5858 Chain,
5859 Op.getOperand(3), // vdata
5860 Op.getOperand(2), // rsrc
5861 VIndex,
5862 VOffset,
5863 Op.getOperand(6), // soffset
5864 Op.getOperand(7), // inst_offset
5865 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5866 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5867 DAG.getConstant(IdxEn->isOne(), DL, MVT::i1), // idxen
5868 };
5869
5870 assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&(((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue
() == 0 && "Value of tfe other than zero is unsupported"
) ? static_cast<void> (0) : __assert_fail ("(cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 && \"Value of tfe other than zero is unsupported\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5871, __PRETTY_FUNCTION__))
5871 "Value of tfe other than zero is unsupported")(((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue
() == 0 && "Value of tfe other than zero is unsupported"
) ? static_cast<void> (0) : __assert_fail ("(cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 && \"Value of tfe other than zero is unsupported\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5871, __PRETTY_FUNCTION__))
;
5872
// Unlike the other store cases, Op is not cast to MemSDNode here; a fresh
// store memory operand is built from an empty MachinePointerInfo and the store
// size of vdata. NOTE(review): the trailing 4 is presumably the base
// alignment -- confirm against getMachineMemOperand.
5873 EVT VT = Op.getOperand(3).getValueType();
5874 MachineMemOperand *MMO = MF.getMachineMemOperand(
5875 MachinePointerInfo(),
5876 MachineMemOperand::MOStore,
5877 VT.getStoreSize(), 4);
5878 return DAG.getMemIntrinsicNode(Opcode, DL,
5879 Op->getVTList(), Ops, VT, MMO);
5880 }
5881
// Typed-buffer store with separate vindex/voffset/offset operands and separate
// dfmt/nfmt/glc/slc immediates, which are packed as in the legacy case above.
5882 case Intrinsic::amdgcn_tbuffer_store: {
5883 SDValue VData = Op.getOperand(2);
5884 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5885 if (IsD16)
5886 VData = handleD16VData(VData, DAG);
5887 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5888 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5889 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5890 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
// idxen stays 1 unless vindex is a constant, in which case it is enabled only
// for a non-zero constant.
5891 unsigned IdxEn = 1;
5892 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5893 IdxEn = Idx->getZExtValue() != 0;
5894 SDValue Ops[] = {
5895 Chain,
5896 VData, // vdata
5897 Op.getOperand(3), // rsrc
5898 Op.getOperand(4), // vindex
5899 Op.getOperand(5), // voffset
5900 Op.getOperand(6), // soffset
5901 Op.getOperand(7), // offset
5902 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5903 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5904 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5905 };
5906 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5907 AMDGPUISD::TBUFFER_STORE_FORMAT;
5908 MemSDNode *M = cast<MemSDNode>(Op);
5909 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5910 M->getMemoryVT(), M->getMemOperand());
5911 }
5912
// Struct form: vindex is an explicit operand and idxen is always 1. The single
// offset operand is divided into voffset/offset by splitBufferOffsets; format
// and cachepolicy arrive already combined and are forwarded unchanged.
5913 case Intrinsic::amdgcn_struct_tbuffer_store: {
5914 SDValue VData = Op.getOperand(2);
5915 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5916 if (IsD16)
5917 VData = handleD16VData(VData, DAG);
5918 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5919 SDValue Ops[] = {
5920 Chain,
5921 VData, // vdata
5922 Op.getOperand(3), // rsrc
5923 Op.getOperand(4), // vindex
5924 Offsets.first, // voffset
5925 Op.getOperand(6), // soffset
5926 Offsets.second, // offset
5927 Op.getOperand(7), // format
5928 Op.getOperand(8), // cachepolicy
5929 DAG.getConstant(1, DL, MVT::i1), // idxen
5930 };
5931 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5932 AMDGPUISD::TBUFFER_STORE_FORMAT;
5933 MemSDNode *M = cast<MemSDNode>(Op);
5934 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5935 M->getMemoryVT(), M->getMemOperand());
5936 }
5937
// Raw form: there is no vindex operand, so a constant 0 is supplied and idxen
// is always 0. All later operand positions are one lower than in the struct
// form.
5938 case Intrinsic::amdgcn_raw_tbuffer_store: {
5939 SDValue VData = Op.getOperand(2);
5940 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5941 if (IsD16)
5942 VData = handleD16VData(VData, DAG);
5943 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5944 SDValue Ops[] = {
5945 Chain,
5946 VData, // vdata
5947 Op.getOperand(3), // rsrc
5948 DAG.getConstant(0, DL, MVT::i32), // vindex
5949 Offsets.first, // voffset
5950 Op.getOperand(5), // soffset
5951 Offsets.second, // offset
5952 Op.getOperand(6), // format
5953 Op.getOperand(7), // cachepolicy
5954 DAG.getConstant(0, DL, MVT::i1), // idxen
5955 };
5956 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5957 AMDGPUISD::TBUFFER_STORE_FORMAT;
5958 MemSDNode *M = cast<MemSDNode>(Op);
5959 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5960 M->getMemoryVT(), M->getMemOperand());
5961 }
5962
// Buffer store with one combined offset operand. The three offset slots are
// left empty in the initializer and filled afterwards by setBufferOffsets,
// which writes Ops[4], Ops[5] and Ops[6].
5963 case Intrinsic::amdgcn_buffer_store:
5964 case Intrinsic::amdgcn_buffer_store_format: {
5965 SDValue VData = Op.getOperand(2);
5966 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5967 if (IsD16)
5968 VData = handleD16VData(VData, DAG);
5969 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5970 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5971 unsigned IdxEn = 1;
5972 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5973 IdxEn = Idx->getZExtValue() != 0;
5974 SDValue Ops[] = {
5975 Chain,
5976 VData,
5977 Op.getOperand(3), // rsrc
5978 Op.getOperand(4), // vindex
5979 SDValue(), // voffset -- will be set by setBufferOffsets
5980 SDValue(), // soffset -- will be set by setBufferOffsets
5981 SDValue(), // offset -- will be set by setBufferOffsets
5982 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5983 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5984 };
5985 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
// f16 data selects the D16 format opcode for both intrinsics in this case,
// overriding the store/store_format choice made on the previous statement.
5986 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
5987 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
5988 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
5989 MemSDNode *M = cast<MemSDNode>(Op);
5990 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5991 M->getMemoryVT(), M->getMemOperand());
5992 }
5993
// Raw buffer store: constant-0 vindex, idxen 0, offset divided by
// splitBufferOffsets, cachepolicy forwarded unchanged.
5994 case Intrinsic::amdgcn_raw_buffer_store:
5995 case Intrinsic::amdgcn_raw_buffer_store_format: {
5996 SDValue VData = Op.getOperand(2);
5997 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5998 if (IsD16)
5999 VData = handleD16VData(VData, DAG);
6000 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6001 SDValue Ops[] = {
6002 Chain,
6003 VData,
6004 Op.getOperand(3), // rsrc
6005 DAG.getConstant(0, DL, MVT::i32), // vindex
6006 Offsets.first, // voffset
6007 Op.getOperand(5), // soffset
6008 Offsets.second, // offset
6009 Op.getOperand(6), // cachepolicy
6010 DAG.getConstant(0, DL, MVT::i1), // idxen
6011 };
6012 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
6013 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6014 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6015 MemSDNode *M = cast<MemSDNode>(Op);
6016 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6017 M->getMemoryVT(), M->getMemOperand());
6018 }
6019
// Struct buffer store: explicit vindex operand, idxen 1, offset divided by
// splitBufferOffsets, cachepolicy forwarded unchanged.
6020 case Intrinsic::amdgcn_struct_buffer_store:
6021 case Intrinsic::amdgcn_struct_buffer_store_format: {
6022 SDValue VData = Op.getOperand(2);
6023 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6024 if (IsD16)
6025 VData = handleD16VData(VData, DAG);
6026 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6027 SDValue Ops[] = {
6028 Chain,
6029 VData,
6030 Op.getOperand(3), // rsrc
6031 Op.getOperand(4), // vindex
6032 Offsets.first, // voffset
6033 Op.getOperand(6), // soffset
6034 Offsets.second, // offset
6035 Op.getOperand(7), // cachepolicy
6036 DAG.getConstant(1, DL, MVT::i1), // idxen
6037 };
6038 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
6039 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6040 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6041 MemSDNode *M = cast<MemSDNode>(Op);
6042 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6043 M->getMemoryVT(), M->getMemOperand());
6044 }
6045
// Anything else: intrinsics with an image-dim table entry go to lowerImage;
// the rest are returned as the original node.
6046 default: {
6047 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6048 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
6049 return lowerImage(Op, ImageDimIntr, DAG);
6050
6051 return Op;
6052 }
6053 }
6054}
6055
6056// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6057// offset (the offset that is included in bounds checking and swizzling, to be
6058// split between the instruction's voffset and immoffset fields) and soffset
6059// (the offset that is excluded from bounds checking and swizzling, to go in
6060// the instruction's soffset field). This function takes the first kind of
6061// offset and figures out how to split it between voffset and immoffset.
6062std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
6063 SDValue Offset, SelectionDAG &DAG) const {
6064 SDLoc DL(Offset);
6065 const unsigned MaxImm = 4095;
6066 SDValue N0 = Offset;
6067 ConstantSDNode *C1 = nullptr;
6068 if (N0.getOpcode() == ISD::ADD) {
6069 if ((C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))))
6070 N0 = N0.getOperand(0);
6071 } else if ((C1 = dyn_cast<ConstantSDNode>(N0)))
6072 N0 = SDValue();
6073
6074 if (C1) {
6075 unsigned ImmOffset = C1->getZExtValue();
6076 // If the immediate value is too big for the immoffset field, put the value
6077 // and -4096 into the immoffset field so that the value that is copied/added
6078 // for the voffset field is a multiple of 4096, and it stands more chance
6079 // of being CSEd with the copy/add for another similar load/store.
6080 // However, do not do that rounding down to a multiple of 4096 if that is a
6081 // negative number, as it appears to be illegal to have a negative offset
6082 // in the vgpr, even if adding the immediate offset makes it positive.
6083 unsigned Overflow = ImmOffset & ~MaxImm;
6084 ImmOffset -= Overflow;
6085 if ((int32_t)Overflow < 0) {
6086 Overflow += ImmOffset;
6087 ImmOffset = 0;
6088 }
6089 C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
6090 if (Overflow) {
6091 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
6092 if (!N0)
6093 N0 = OverflowVal;
6094 else {
6095 SDValue Ops[] = { N0, OverflowVal };
6096 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
6097 }
6098 }
6099 }
6100 if (!N0)
6101 N0 = DAG.getConstant(0, DL, MVT::i32);
6102 if (!C1)
6103 C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
6104 return {N0, SDValue(C1, 0)};
6105}
6106
6107// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
6108// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
6109// pointed to by Offsets.
6110void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
6111 SelectionDAG &DAG, SDValue *Offsets,
6112 unsigned Align) const {
6113 SDLoc DL(CombinedOffset);
6114 if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
6115 uint32_t Imm = C->getZExtValue();
6116 uint32_t SOffset, ImmOffset;
6117 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
6118 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
6119 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6120 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6121 return;
6122 }
6123 }
6124 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
6125 SDValue N0 = CombinedOffset.getOperand(0);
6126 SDValue N1 = CombinedOffset.getOperand(1);
6127 uint32_t SOffset, ImmOffset;
6128 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
6129 if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6130 Subtarget, Align)) {
6131 Offsets[0] = N0;
6132 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6133 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6134 return;
6135 }
6136 }
6137 Offsets[0] = CombinedOffset;
6138 Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
6139 Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
6140}
6141
6142static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
6143 ISD::LoadExtType ExtType, SDValue Op,
6144 const SDLoc &SL, EVT VT) {
6145 if (VT.bitsLT(Op.getValueType()))
6146 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
6147
6148 switch (ExtType) {
6149 case ISD::SEXTLOAD:
6150 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
6151 case ISD::ZEXTLOAD:
6152 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
6153 case ISD::EXTLOAD:
6154 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
6155 case ISD::NON_EXTLOAD:
6156 return Op;
6157 }
6158
6159 llvm_unreachable("invalid ext type")::llvm::llvm_unreachable_internal("invalid ext type", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 6159)
;
6160}
6161
6162SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
6163 SelectionDAG &DAG = DCI.DAG;
6164 if (Ld->getAlignment() < 4 || Ld->isDivergent())
6165 return SDValue();
6166
6167 // FIXME: Constant loads should all be marked invariant.
6168 unsigned AS = Ld->getAddressSpace();
6169 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
6170 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6171 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
6172 return SDValue();
6173
6174 // Don't do this early, since it may interfere with adjacent load merging for
6175 // illegal types. We can avoid losing alignment information for exotic types
6176 // pre-legalize.
6177 EVT MemVT = Ld->getMemoryVT();
6178 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
6179 MemVT.getSizeInBits() >= 32)
6180 return SDValue();
6181
6182 SDLoc SL(Ld);
6183
6184 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&(((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD
) && "unexpected vector extload") ? static_cast<void
> (0) : __assert_fail ("(!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) && \"unexpected vector extload\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 6185, __PRETTY_FUNCTION__))
6185 "unexpected vector extload")(((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD
) && "unexpected vector extload") ? static_cast<void
> (0) : __assert_fail ("(!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) && \"unexpected vector extload\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 6185, __PRETTY_FUNCTION__))
;
6186
6187 // TODO: Drop only high part of range.
6188 SDValue Ptr = Ld->getBasePtr();
6189 SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
6190 MVT::i32, SL, Ld->getChain(), Ptr,
6191 Ld->getOffset(),
6192 Ld->getPointerInfo(), MVT::i32,
6193 Ld->getAlignment(),
6194 Ld->getMemOperand()->getFlags(),
6195 Ld->getAAInfo(),
6196 nullptr); // Drop ranges
6197
6198 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
6199 if (MemVT.isFloatingPoint()) {
6200 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&((Ld->getExtensionType() == ISD::NON_EXTLOAD && "unexpected fp extload"
) ? static_cast<void> (0) : __assert_fail ("Ld->getExtensionType() == ISD::NON_EXTLOAD && \"unexpected fp extload\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 6201, __PRETTY_FUNCTION__))
6201 "unexpected fp extload")((Ld->getExtensionType() == ISD::NON_EXTLOAD && "unexpected fp extload"
) ? static_cast<void> (0) : __assert_fail ("Ld->getExtensionType() == ISD::NON_EXTLOAD && \"unexpected fp extload\""
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 6201, __PRETTY_FUNCTION__))
;
6202 TruncVT = MemVT.changeTypeToInteger();
6203 }
6204
6205 SDValue Cvt = NewLoad;
6206 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
6207 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
6208 DAG.getValueType(TruncVT));
6209 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
6210 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
6211 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
6212 } else {
6213 assert(Ld->getExtensionType() == ISD::EXTLOAD)((Ld->getExtensionType() == ISD::EXTLOAD) ? static_cast<
void> (0) : __assert_fail ("Ld->getExtensionType() == ISD::EXTLOAD"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 6213, __PRETTY_FUNCTION__))
;
6214 }
6215
6216 EVT VT = Ld->getValueType(0);
6217 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
6218
6219 DCI.AddToWorklist(Cvt.getNode());
6220
6221 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
6222 // the appropriate extension from the 32-bit load.
6223 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
6224 DCI.AddToWorklist(Cvt.getNode());
6225
6226 // Handle conversion back to floating point if necessary.
6227 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
6228
6229 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
6230}
6231
6232SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
6233 SDLoc DL(Op);
6234 LoadSDNode *Load = cast<LoadSDNode>(Op);
6235 ISD::LoadExtType ExtType = Load->getExtensionType();
6236 EVT MemVT = Load->getMemoryVT();
6237
6238 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
6239 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
6240 return SDValue();
6241
6242 // FIXME: Copied from PPC
6243 // First, load into 32 bits, then truncate back to the original memory type.
6244
6245 SDValue Chain = Load->getChain();
6246 SDValue BasePtr = Load->getBasePtr();
6247 MachineMemOperand *MMO = Load->getMemOperand();
6248
6249 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
6250
6251 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
6252 BasePtr, RealMemVT, MMO);
6253
6254 SDValue Ops[] = {
6255 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
6256 NewLD.getValue(1)
6257 };
6258
6259 return DAG.getMergeValues(Ops, DL);