AMDGPUISelLowering.cpp
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
39
40// Find a larger type to do a load / store of a vector with.
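// For example, a type of 32 bits or fewer becomes the integer of the same
// store size (v4i8 -> i32), and any dword-multiple type becomes a vector of
// i32 (v8i16 -> v4i32); anything else is returned unchanged.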
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
51
55
57 // In order for this to be a signed 24-bit value, bit 23 must
58 // be a sign bit.
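  // ComputeMaxSignificantBits returns the minimum number of bits needed to
  // represent the value as a signed integer, so a result of 24 or fewer means
  // the value can be treated as a signed 24-bit operand.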
59 return DAG.ComputeMaxSignificantBits(Op);
60}
61
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather than generating calls to memset, memcpy or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
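  // Each Promote/AddPromotedToType pair below maps the FP type to the integer
  // type of the same size, so e.g. an f32 load is selected as an i32 load and
  // the result is bitcast back to f32.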
77 setOperationAction(ISD::LOAD, MVT::f32, Promote);
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
80 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
83 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
86 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
89 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
92 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
95 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
98 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
101 setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
119 setOperationAction(ISD::LOAD, MVT::i64, Promote);
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
122 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
125 setOperationAction(ISD::LOAD, MVT::f64, Promote);
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
128 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
131 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
134 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
137 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
140 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
143 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
146 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
155 setOperationAction(ISD::LOAD, MVT::i128, Promote);
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
159 setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
162 setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
165 setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
168 setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
171 setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
172 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
173
174 setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
176
177 setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
179
180 setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
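  // For example, a sign-extending i8-to-i64 load is selected as an i8-to-i32
  // sextload followed by a 32-to-64-bit sign extension.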
185 for (MVT VT : MVT::integer_valuetypes())
187 Expand);
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(Op, VT, MVT::i1, Promote);
195 setLoadExtAction(Op, VT, MVT::i8, Legal);
196 setLoadExtAction(Op, VT, MVT::i16, Legal);
197 setLoadExtAction(Op, VT, MVT::i32, Expand);
198 }
199 }
200
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
205 Expand);
206
207 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
221
222 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
227 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
228
229 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
241
242 setOperationAction(ISD::STORE, MVT::f32, Promote);
243 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
244
245 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
246 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
247
248 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
249 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
250
251 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
252 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
253
254 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
255 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
256
257 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
258 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
259
260 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
261 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
262
263 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
264 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
265
266 setOperationAction(ISD::STORE, MVT::v9f32, Promote);
267 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
268
269 setOperationAction(ISD::STORE, MVT::v10f32, Promote);
270 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
271
272 setOperationAction(ISD::STORE, MVT::v11f32, Promote);
273 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
274
275 setOperationAction(ISD::STORE, MVT::v12f32, Promote);
276 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
277
278 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
279 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
280
281 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
282 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
283
284 setOperationAction(ISD::STORE, MVT::i64, Promote);
285 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
286
287 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
288 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
289
290 setOperationAction(ISD::STORE, MVT::f64, Promote);
291 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
292
293 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
294 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
295
296 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
297 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
298
299 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
300 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
301
302 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
303 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
304
305 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
306 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
307
308 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
309 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
310
311 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
312 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
313
314 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
315 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
316
317 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
318 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
319
320 setOperationAction(ISD::STORE, MVT::i128, Promote);
321 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
322
323 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
325 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
326 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
327
328 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
330 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
331 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
332
333 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
334 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
335 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
336 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
337 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
338 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
339 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
340 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
341 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
342 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
343 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
344 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
345 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
346 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
347
348 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
349 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
350 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
351
352 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
353 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
354 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
355
356 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
357
358 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
359 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
360 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
361 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
362 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
363 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
364 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
365
366 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
367 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
368 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
369 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
370 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
371
372 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
373 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
374 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
375
376 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
377 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
378 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
379
380 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
381 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
382 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
383
384 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
385 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
386 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
387
388 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
389 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
390 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
391 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
392 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
394 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
395
396 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
397 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
398
399 setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
400
401 // For R600, this is totally unsupported, so just custom lower to produce an
402 // error.
403 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
404
405 // Library functions. These default to Expand, but we have instructions
406 // for them.
407 setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
408 ISD::FROUNDEVEN, ISD::FTRUNC},
409 {MVT::f16, MVT::f32}, Legal);
410 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);
411
412 setOperationAction(ISD::FLOG2, MVT::f32, Custom);
413 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
414 setOperationAction({ISD::LROUND, ISD::LLROUND},
415 {MVT::f16, MVT::f32, MVT::f64}, Expand);
416
418 {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
419 Custom);
420
421 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
422
423 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
424
425 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
426 Expand);
427
428 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
429
430 if (Subtarget->has16BitInsts()) {
431 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
432 setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
433 } else {
434 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
435 setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
436 }
437
438 setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
439 Custom);
440
441 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
442 if (Subtarget->has16BitInsts()) {
444 }
445
446 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
447 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
448 // default unless marked custom/legal.
450 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
451 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
452 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
453 MVT::v16f64},
454 Custom);
455
456 if (isTypeLegal(MVT::f16))
458 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
459 Custom);
460
461 // Expand to fneg + fadd.
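  // i.e. (fsub x, y) is lowered as (fadd x, (fneg y)).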
463
465 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
466 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
467 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
468 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
469 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
470 Custom);
471
474 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
475 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
476 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
477 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
478 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
479 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
480 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
481 Custom);
482
483 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
484 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
485
486 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
487 for (MVT VT : ScalarIntVTs) {
488 // These should use [SU]DIVREM, so set them to expand
490 Expand);
491
492 // GPU does not have divrem function for signed or unsigned.
494
495 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
497
499
500 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
502 }
503
504 // The hardware supports 32-bit FSHR, but not FSHL.
506
507 setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
508
510
514 MVT::i64, Custom);
516
518 Legal);
519
522 MVT::i64, Custom);
523
524 for (auto VT : {MVT::i8, MVT::i16})
526
527 static const MVT::SimpleValueType VectorIntTypes[] = {
528 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
529 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
530
531 for (MVT VT : VectorIntTypes) {
532 // Expand the following operations for the current type by default.
544 ISD::SETCC, ISD::ADDRSPACECAST},
545 VT, Expand);
546 }
547
548 static const MVT::SimpleValueType FloatVectorTypes[] = {
549 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
550 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
551
552 for (MVT VT : FloatVectorTypes) {
554 {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
555 ISD::FADD, ISD::FCEIL, ISD::FCOS,
556 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
557 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
558 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
559 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
560 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
561 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
562 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
564 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
565 VT, Expand);
566 }
567
568 // This causes an unrolled select operation to be used rather than expansion with
569 // bit operations. This is in general better, but the alternative using BFI
570 // instructions may be better if the select sources are SGPRs.
572 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
573
575 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
576
578 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
579
581 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
582
584 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
585
587 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
588
590 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
591
593 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
594
596 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
597
599 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
600
602 setJumpIsExpensive(true);
603
606
608
609 // We want to find all load dependencies for long chains of stores to enable
610 // merging into very wide vectors. The problem is with vectors with > 4
611 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
612 // vectors are a legal type, even though we have to split the loads
613 // usually. When we can more precisely specify load legality per address
614 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
615 // smarter so that they can figure out what to do in 2 iterations without all
616 // N > 4 stores on the same chain.
618
619 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
620 // about these during lowering.
621 MaxStoresPerMemcpy = 0xffffffff;
622 MaxStoresPerMemmove = 0xffffffff;
623 MaxStoresPerMemset = 0xffffffff;
624
625 // The expansion for 64-bit division is enormous.
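  // addBypassSlowDiv(64, 32) emits a runtime check so that 64-bit divisions
  // whose operands actually fit in 32 bits take the much cheaper 32-bit
  // division path instead.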
627 addBypassSlowDiv(64, 32);
628
629 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
635 ISD::STORE, ISD::FADD,
636 ISD::FSUB, ISD::FNEG,
637 ISD::FABS, ISD::AssertZext,
639
643}
644
646 if (getTargetMachine().Options.NoSignedZerosFPMath)
647 return true;
648
649 const auto Flags = Op.getNode()->getFlags();
650 if (Flags.hasNoSignedZeros())
651 return true;
652
653 return false;
654}
655
656//===----------------------------------------------------------------------===//
657// Target Information
658//===----------------------------------------------------------------------===//
659
661static bool fnegFoldsIntoOpcode(unsigned Opc) {
662 switch (Opc) {
663 case ISD::FADD:
664 case ISD::FSUB:
665 case ISD::FMUL:
666 case ISD::FMA:
667 case ISD::FMAD:
668 case ISD::FMINNUM:
669 case ISD::FMAXNUM:
670 case ISD::FMINNUM_IEEE:
671 case ISD::FMAXNUM_IEEE:
672 case ISD::FMINIMUM:
673 case ISD::FMAXIMUM:
674 case ISD::FMINIMUMNUM:
675 case ISD::FMAXIMUMNUM:
676 case ISD::SELECT:
677 case ISD::FSIN:
678 case ISD::FTRUNC:
679 case ISD::FRINT:
680 case ISD::FNEARBYINT:
681 case ISD::FROUNDEVEN:
683 case AMDGPUISD::RCP:
684 case AMDGPUISD::RCP_LEGACY:
685 case AMDGPUISD::RCP_IFLAG:
686 case AMDGPUISD::SIN_HW:
687 case AMDGPUISD::FMUL_LEGACY:
688 case AMDGPUISD::FMIN_LEGACY:
689 case AMDGPUISD::FMAX_LEGACY:
690 case AMDGPUISD::FMED3:
691 // TODO: handle llvm.amdgcn.fma.legacy
692 return true;
693 case ISD::BITCAST:
694 llvm_unreachable("bitcast is special cased");
695 default:
696 return false;
697 }
698}
699
700static bool fnegFoldsIntoOp(const SDNode *N) {
701 unsigned Opc = N->getOpcode();
702 if (Opc == ISD::BITCAST) {
703 // TODO: Is there a benefit to checking the conditions performFNegCombine
704 // does? We don't for the other cases.
705 SDValue BCSrc = N->getOperand(0);
706 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
707 return BCSrc.getNumOperands() == 2 &&
708 BCSrc.getOperand(1).getValueSizeInBits() == 32;
709 }
710
711 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
712 }
713
714 return fnegFoldsIntoOpcode(Opc);
715}
716
717/// \returns true if the operation will definitely need to use a 64-bit
718/// encoding, and thus will use a VOP3 encoding regardless of the source
719/// modifiers.
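/// For example, three-operand nodes such as FMA/FMAD, and any f64 operation,
/// can only be encoded as VOP3, so a source modifier on them never costs an
/// extra encoding.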
721static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
722 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
723 VT == MVT::f64;
724}
725
726/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers
727/// for this type when used for ISD::SELECT.
729static bool selectSupportsSourceMods(const SDNode *N) {
730 // TODO: Only applies if select will be vector
731 return N->getValueType(0) == MVT::f32;
732}
733
734// Most FP instructions support source modifiers, but this could be refined
735// slightly.
737static bool hasSourceMods(const SDNode *N) {
738 if (isa<MemSDNode>(N))
739 return false;
740
741 switch (N->getOpcode()) {
742 case ISD::CopyToReg:
743 case ISD::FDIV:
744 case ISD::FREM:
745 case ISD::INLINEASM:
746 case ISD::INLINEASM_BR:
747 case AMDGPUISD::DIV_SCALE:
749
750 // TODO: Should really be looking at the users of the bitcast. These are
751 // problematic because bitcasts are used to legalize all stores to integer
752 // types.
753 case ISD::BITCAST:
754 return false;
756 switch (N->getConstantOperandVal(0)) {
757 case Intrinsic::amdgcn_interp_p1:
758 case Intrinsic::amdgcn_interp_p2:
759 case Intrinsic::amdgcn_interp_mov:
760 case Intrinsic::amdgcn_interp_p1_f16:
761 case Intrinsic::amdgcn_interp_p2_f16:
762 return false;
763 default:
764 return true;
765 }
766 }
767 case ISD::SELECT:
769 default:
770 return true;
771 }
772}
773
775 unsigned CostThreshold) {
776 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
777 // it is truly free to use a source modifier in all cases. If there are
778 // multiple users, and each of them would be forced into a VOP3 encoding, there will be
779 // a code size increase. Try to avoid increasing code size unless we know it
780 // will save on the instruction count.
781 unsigned NumMayIncreaseSize = 0;
782 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
783
784 assert(!N->use_empty());
785
786 // XXX - Should this limit number of uses to check?
787 for (const SDNode *U : N->users()) {
788 if (!hasSourceMods(U))
789 return false;
790
791 if (!opMustUseVOP3Encoding(U, VT)) {
792 if (++NumMayIncreaseSize > CostThreshold)
793 return false;
794 }
795 }
796
797 return true;
798}
799
801 ISD::NodeType ExtendKind) const {
802 assert(!VT.isVector() && "only scalar expected");
803
804 // Round to the next multiple of 32-bits.
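  // For example, an i16 return value is widened to i32 and an i40 return
  // value to i64.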
805 unsigned Size = VT.getSizeInBits();
806 if (Size <= 32)
807 return MVT::i32;
808 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
809}
810
812 return 32;
813}
814
816 return true;
817}
818
819// The backend supports 32 and 64 bit floating point immediates.
820// FIXME: Why are we reporting vectors of FP immediates as legal?
822 bool ForCodeSize) const {
823 EVT ScalarVT = VT.getScalarType();
824 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
825 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
826}
827
828// We don't want to shrink f64 / f32 constants.
830 EVT ScalarVT = VT.getScalarType();
831 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
832}
833
835 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
836 std::optional<unsigned> ByteOffset) const {
837 // TODO: This may be worth removing. Check regression tests for diffs.
838 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
839 return false;
840
841 unsigned NewSize = NewVT.getStoreSizeInBits();
842
843 // If we are reducing to a 32-bit load or a smaller multi-dword load,
844 // this is always better.
845 if (NewSize >= 32)
846 return true;
847
848 EVT OldVT = N->getValueType(0);
849 unsigned OldSize = OldVT.getStoreSizeInBits();
850
852 unsigned AS = MN->getAddressSpace();
853 // Do not shrink an aligned scalar load to sub-dword.
854 // Scalar engine cannot do sub-dword loads.
855 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
856 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
860 MN->isInvariant())) &&
862 return false;
863
864 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
865 // extloads, so doing one requires using a buffer_load. In cases where we
866 // still couldn't use a scalar load, using the wider load shouldn't really
867 // hurt anything.
868
869 // If the old size already had to be an extload, there's no harm in continuing
870 // to reduce the width.
871 return (OldSize < 32);
872}
873
875 const SelectionDAG &DAG,
876 const MachineMemOperand &MMO) const {
877
878 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
879
880 if (LoadTy.getScalarType() == MVT::i32)
881 return false;
882
883 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
884 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
885
886 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
887 return false;
888
889 unsigned Fast = 0;
891 CastTy, MMO, &Fast) &&
892 Fast;
893}
894
895// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
896// profitable with the expansion for 64-bit since it's generally good to
897// speculate things.
899 return true;
900}
901
903 return true;
904}
905
907 switch (N->getOpcode()) {
908 case ISD::EntryToken:
909 case ISD::TokenFactor:
910 return true;
912 unsigned IntrID = N->getConstantOperandVal(0);
914 }
916 unsigned IntrID = N->getConstantOperandVal(1);
918 }
919 case ISD::LOAD:
920 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
922 return true;
923 return false;
924 case AMDGPUISD::SETCC: // ballot-style instruction
925 return true;
926 }
927 return false;
928}
929
931 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
932 NegatibleCost &Cost, unsigned Depth) const {
933
934 switch (Op.getOpcode()) {
935 case ISD::FMA:
936 case ISD::FMAD: {
937 // Negating a fma is not free if it has users without source mods.
938 if (!allUsesHaveSourceMods(Op.getNode()))
939 return SDValue();
940 break;
941 }
942 case AMDGPUISD::RCP: {
943 SDValue Src = Op.getOperand(0);
944 EVT VT = Op.getValueType();
945 SDLoc SL(Op);
946
947 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
948 ForCodeSize, Cost, Depth + 1);
949 if (NegSrc)
950 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
951 return SDValue();
952 }
953 default:
954 break;
955 }
956
957 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
958 ForCodeSize, Cost, Depth);
959}
960
961//===---------------------------------------------------------------------===//
962// Target Properties
963//===---------------------------------------------------------------------===//
964
967
968 // Packed operations do not have a fabs modifier.
969 return VT == MVT::f32 || VT == MVT::f64 ||
970 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
971}
972
975 // Report this based on the end legalized type.
976 VT = VT.getScalarType();
977 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
978}
979
981 unsigned NumElem,
982 unsigned AS) const {
983 return true;
984}
985
987 // There are few operations which truly have vector input operands. Any vector
988 // operation is going to involve operations on each component, and a
989 // build_vector will be a copy per element, so it always makes sense to use a
990 // build_vector input in place of the extracted element to avoid a copy into a
991 // super register.
992 //
993 // We should probably only do this if all users are extracts only, but this
994 // should be the common case.
995 return true;
996}
997
999 // Truncate is just accessing a subregister.
1000
1001 unsigned SrcSize = Source.getSizeInBits();
1002 unsigned DestSize = Dest.getSizeInBits();
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0;
1005}
1006
1008 // Truncate is just accessing a subregister.
1009
1010 unsigned SrcSize = Source->getScalarSizeInBits();
1011 unsigned DestSize = Dest->getScalarSizeInBits();
1012
1013 if (DestSize == 16 && Subtarget->has16BitInsts())
1014 return SrcSize >= 32;
1015
1016 return DestSize < SrcSize && DestSize % 32 == 0;
1017}
1018
1020 unsigned SrcSize = Src->getScalarSizeInBits();
1021 unsigned DestSize = Dest->getScalarSizeInBits();
1022
1023 if (SrcSize == 16 && Subtarget->has16BitInsts())
1024 return DestSize >= 32;
1025
1026 return SrcSize == 32 && DestSize == 64;
1027}
1028
1030 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1031 // practical purposes, the extra mov of 0 needed to form a 64-bit value is free. As used,
1032 // this will enable reducing 64-bit operations to 32-bit, which is always
1033 // good.
1034
1035 if (Src == MVT::i16)
1036 return Dest == MVT::i32 || Dest == MVT::i64;
1037
1038 return Src == MVT::i32 && Dest == MVT::i64;
1039}
1040
1042 EVT DestVT) const {
1043 switch (N->getOpcode()) {
1044 case ISD::ADD:
1045 case ISD::SUB:
1046 case ISD::SHL:
1047 case ISD::SRL:
1048 case ISD::SRA:
1049 case ISD::AND:
1050 case ISD::OR:
1051 case ISD::XOR:
1052 case ISD::MUL:
1053 case ISD::SETCC:
1054 case ISD::SELECT:
1055 case ISD::SMIN:
1056 case ISD::SMAX:
1057 case ISD::UMIN:
1058 case ISD::UMAX:
1059 if (Subtarget->has16BitInsts() &&
1060 (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
1061 // Don't narrow back down to i16 if promoted to i32 already.
1062 if (!N->isDivergent() && DestVT.isInteger() &&
1063 DestVT.getScalarSizeInBits() > 1 &&
1064 DestVT.getScalarSizeInBits() <= 16 &&
1065 SrcVT.getScalarSizeInBits() > 16) {
1066 return false;
1067 }
1068 }
1069 return true;
1070 default:
1071 break;
1072 }
1073
1074 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1075 // limited number of native 64-bit operations. Shrinking an operation to fit
1076 // in a single 32-bit register should always be helpful. As currently used,
1077 // this is much less general than the name suggests, and is only used in
1078 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1079 // not profitable, and may actually be harmful.
1080 if (isa<LoadSDNode>(N))
1081 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1082
1083 return true;
1084}
1085
1087 const SDNode* N, CombineLevel Level) const {
1088 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1089 N->getOpcode() == ISD::SRL) &&
1090 "Expected shift op");
1091
1092 SDValue ShiftLHS = N->getOperand(0);
1093 if (!ShiftLHS->hasOneUse())
1094 return false;
1095
1096 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1097 !ShiftLHS.getOperand(0)->hasOneUse())
1098 return false;
1099
1100 // Always commute pre-type legalization and right shifts.
1101 // We're looking for shl(or(x,y),z) patterns.
1103 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1104 return true;
1105
1106 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1107 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1108 (N->user_begin()->getOpcode() == ISD::SRA ||
1109 N->user_begin()->getOpcode() == ISD::SRL))
1110 return false;
1111
1112 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1113 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1114 if (LHS.getOpcode() != ISD::SHL)
1115 return false;
1116 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1117 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1118 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1119 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1120 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1121 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1122 };
1123 SDValue LHS = N->getOperand(0).getOperand(0);
1124 SDValue RHS = N->getOperand(0).getOperand(1);
1125 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1126}
1127
1128//===---------------------------------------------------------------------===//
1129// TargetLowering Callbacks
1130//===---------------------------------------------------------------------===//
1131
1133 bool IsVarArg) {
1134 switch (CC) {
1142 return CC_AMDGPU;
1145 return CC_AMDGPU_CS_CHAIN;
1146 case CallingConv::C:
1147 case CallingConv::Fast:
1148 case CallingConv::Cold:
1149 return CC_AMDGPU_Func;
1152 return CC_SI_Gfx;
1155 default:
1156 reportFatalUsageError("unsupported calling convention for call");
1157 }
1158}
1159
1161 bool IsVarArg) {
1162 switch (CC) {
1165 llvm_unreachable("kernels should not be handled here");
1175 return RetCC_SI_Shader;
1178 return RetCC_SI_Gfx;
1179 case CallingConv::C:
1180 case CallingConv::Fast:
1181 case CallingConv::Cold:
1182 return RetCC_AMDGPU_Func;
1183 default:
1184 reportFatalUsageError("unsupported calling convention");
1185 }
1186}
1187
1188/// The SelectionDAGBuilder will automatically promote function arguments
1189/// with illegal types. However, this does not work for the AMDGPU targets
1190/// since the function arguments are stored in memory as these illegal types.
1191 /// In order to handle this properly we need to get the original type sizes
1192 /// from the LLVM IR Function and fix up the ISD::InputArg values before
1193/// passing them to AnalyzeFormalArguments()
1194
1195/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1196/// input values across multiple registers. Each item in the Ins array
1197/// represents a single value that will be stored in registers. Ins[x].VT is
1198/// the value type of the value that will be stored in the register, so
1199/// whatever SDNode we lower the argument to needs to be this type.
1200///
1201/// In order to correctly lower the arguments we need to know the size of each
1202/// argument. Since Ins[x].VT gives us the size of the register that will
1203/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1204/// for the original function argument so that we can deduce the correct memory
1205/// type to use for Ins[x]. In most cases the correct memory type will be
1206/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1207/// we have a kernel argument of type v8i8, this argument will be split into
1208/// 8 parts and each part will be represented by its own item in the Ins array.
1209/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1210/// the argument before it was split. From this, we deduce that the memory type
1211/// for each individual part is i8. We pass the memory type as LocVT to the
1212/// calling convention analysis function and the register type (Ins[x].VT) as
1213/// the ValVT.
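/// As a further example, an i64 kernel argument that is split into two i32
/// registers (NumRegs == 2) is recorded as two CCValAssign entries with an
/// i32 memory type, at byte offsets ArgOffset and ArgOffset + 4.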
1215 CCState &State,
1216 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1217 const MachineFunction &MF = State.getMachineFunction();
1218 const Function &Fn = MF.getFunction();
1219 LLVMContext &Ctx = Fn.getContext();
1220 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1221 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1223
1224 Align MaxAlign = Align(1);
1225 uint64_t ExplicitArgOffset = 0;
1226 const DataLayout &DL = Fn.getDataLayout();
1227
1228 unsigned InIndex = 0;
1229
1230 for (const Argument &Arg : Fn.args()) {
1231 const bool IsByRef = Arg.hasByRefAttr();
1232 Type *BaseArgTy = Arg.getType();
1233 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1234 Align Alignment = DL.getValueOrABITypeAlignment(
1235 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1236 MaxAlign = std::max(Alignment, MaxAlign);
1237 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1238
1239 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1240 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1241
1242 // We're basically throwing away everything passed into us and starting over
1243 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1244 // to us as computed in Ins.
1245 //
1246 // We also need to figure out what type legalization is trying to do to get
1247 // the correct memory offsets.
1248
1249 SmallVector<EVT, 16> ValueVTs;
1251 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1252 &Offsets, ArgOffset);
1253
1254 for (unsigned Value = 0, NumValues = ValueVTs.size();
1255 Value != NumValues; ++Value) {
1256 uint64_t BasePartOffset = Offsets[Value];
1257
1258 EVT ArgVT = ValueVTs[Value];
1259 EVT MemVT = ArgVT;
1260 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1261 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1262
1263 if (NumRegs == 1) {
1264 // This argument is not split, so the IR type is the memory type.
1265 if (ArgVT.isExtended()) {
1266 // We have an extended type, like i24, so we should just use the
1267 // register type.
1268 MemVT = RegisterVT;
1269 } else {
1270 MemVT = ArgVT;
1271 }
1272 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1273 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1274 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1275 // We have a vector value which has been split into a vector with
1276 // the same scalar type, but fewer elements. This should handle
1277 // all the floating-point vector types.
1278 MemVT = RegisterVT;
1279 } else if (ArgVT.isVector() &&
1280 ArgVT.getVectorNumElements() == NumRegs) {
1281 // This arg has been split so that each element is stored in a separate
1282 // register.
1283 MemVT = ArgVT.getScalarType();
1284 } else if (ArgVT.isExtended()) {
1285 // We have an extended type, like i65.
1286 MemVT = RegisterVT;
1287 } else {
1288 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1289 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1290 if (RegisterVT.isInteger()) {
1291 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1292 } else if (RegisterVT.isVector()) {
1293 assert(!RegisterVT.getScalarType().isFloatingPoint());
1294 unsigned NumElements = RegisterVT.getVectorNumElements();
1295 assert(MemoryBits % NumElements == 0);
1296 // This vector type has been split into another vector type with
1297 // a different element size.
1298 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1299 MemoryBits / NumElements);
1300 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1301 } else {
1302 llvm_unreachable("cannot deduce memory type.");
1303 }
1304 }
1305
1306 // Convert one element vectors to scalar.
1307 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1308 MemVT = MemVT.getScalarType();
1309
1310 // Round up vec3/vec5 argument.
1311 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1312 MemVT = MemVT.getPow2VectorType(State.getContext());
1313 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1314 MemVT = MemVT.getRoundIntegerType(State.getContext());
1315 }
1316
1317 unsigned PartOffset = 0;
1318 for (unsigned i = 0; i != NumRegs; ++i) {
1319 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1320 BasePartOffset + PartOffset,
1321 MemVT.getSimpleVT(),
1323 PartOffset += MemVT.getStoreSize();
1324 }
1325 }
1326 }
1327}
1328
1330 SDValue Chain, CallingConv::ID CallConv,
1331 bool isVarArg,
1333 const SmallVectorImpl<SDValue> &OutVals,
1334 const SDLoc &DL, SelectionDAG &DAG) const {
1335 // FIXME: Fails for r600 tests
1336 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1337 // "wave terminate should not have return values");
1338 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1339}
1340
1341//===---------------------------------------------------------------------===//
1342// Target specific lowering
1343//===---------------------------------------------------------------------===//
1344
1345/// Selects the correct CCAssignFn for a given CallingConvention value.
1350
1355
1357 SelectionDAG &DAG,
1358 MachineFrameInfo &MFI,
1359 int ClobberedFI) const {
1360 SmallVector<SDValue, 8> ArgChains;
1361 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1362 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1363
1364 // Include the original chain at the beginning of the list. When this is
1365 // used by target LowerCall hooks, this helps legalize find the
1366 // CALLSEQ_BEGIN node.
1367 ArgChains.push_back(Chain);
1368
1369 // Add a chain value for each stack-argument load that overlaps the clobbered stack object.
1370 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1371 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1372 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1373 if (FI->getIndex() < 0) {
1374 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1375 int64_t InLastByte = InFirstByte;
1376 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1377
1378 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1379 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1380 ArgChains.push_back(SDValue(L, 1));
1381 }
1382 }
1383 }
1384 }
1385
1386 // Build a tokenfactor for all the chains.
1387 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1388}
1389
1392 StringRef Reason) const {
1393 SDValue Callee = CLI.Callee;
1394 SelectionDAG &DAG = CLI.DAG;
1395
1396 const Function &Fn = DAG.getMachineFunction().getFunction();
1397
1398 StringRef FuncName("<unknown>");
1399
1401 FuncName = G->getSymbol();
1402 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1403 FuncName = G->getGlobal()->getName();
1404
1405 DAG.getContext()->diagnose(
1406 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1407
1408 if (!CLI.IsTailCall) {
1409 for (ISD::InputArg &Arg : CLI.Ins)
1410 InVals.push_back(DAG.getPOISON(Arg.VT));
1411 }
1412
1413 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1414 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1415 return CLI.Chain;
1416
1417 SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
1418 return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
1419}
1420
1422 SmallVectorImpl<SDValue> &InVals) const {
1423 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1424}
1425
1427 SelectionDAG &DAG) const {
1428 const Function &Fn = DAG.getMachineFunction().getFunction();
1429
1431 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1432 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1433 return DAG.getMergeValues(Ops, SDLoc());
1434}
1435
1437 SelectionDAG &DAG) const {
1438 switch (Op.getOpcode()) {
1439 default:
1440 Op->print(errs(), &DAG);
1441 llvm_unreachable("Custom lowering code for this "
1442 "instruction is not implemented yet!");
1443 break;
1445 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1447 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1448 case ISD::SDIVREM:
1449 return LowerSDIVREM(Op, DAG);
1450 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1451 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1452 case ISD::FRINT: return LowerFRINT(Op, DAG);
1453 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1454 case ISD::FROUNDEVEN:
1455 return LowerFROUNDEVEN(Op, DAG);
1456 case ISD::FROUND: return LowerFROUND(Op, DAG);
1457 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1458 case ISD::FLOG2:
1459 return LowerFLOG2(Op, DAG);
1460 case ISD::FLOG:
1461 case ISD::FLOG10:
1462 return LowerFLOGCommon(Op, DAG);
1463 case ISD::FEXP:
1464 case ISD::FEXP10:
1465 return lowerFEXP(Op, DAG);
1466 case ISD::FEXP2:
1467 return lowerFEXP2(Op, DAG);
1468 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1469 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1470 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1471 case ISD::FP_TO_SINT:
1472 case ISD::FP_TO_UINT:
1473 return LowerFP_TO_INT(Op, DAG);
1474 case ISD::CTTZ:
1476 case ISD::CTLZ:
1478 return LowerCTLZ_CTTZ(Op, DAG);
1479 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1480 }
1481 return Op;
1482}
1483
1486 SelectionDAG &DAG) const {
1487 switch (N->getOpcode()) {
1489 // Different parts of legalization seem to interpret which type of
1490 // sign_extend_inreg is the one to check for custom lowering. The extended
1491 // from type is what really matters, but some places check for custom
1492 // lowering of the result type. This results in trying to use
1493 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1494 // nothing here and let the illegal result integer be handled normally.
1495 return;
1496 case ISD::FLOG2:
1497 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1498 Results.push_back(Lowered);
1499 return;
1500 case ISD::FLOG:
1501 case ISD::FLOG10:
1502 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1503 Results.push_back(Lowered);
1504 return;
1505 case ISD::FEXP2:
1506 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1507 Results.push_back(Lowered);
1508 return;
1509 case ISD::FEXP:
1510 case ISD::FEXP10:
1511 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1512 Results.push_back(Lowered);
1513 return;
1514 case ISD::CTLZ:
1516 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1517 Results.push_back(Lowered);
1518 return;
1519 default:
1520 return;
1521 }
1522}
1523
1525 SDValue Op,
1526 SelectionDAG &DAG) const {
1527
1528 const DataLayout &DL = DAG.getDataLayout();
1530 const GlobalValue *GV = G->getGlobal();
1531
1532 if (!MFI->isModuleEntryFunction()) {
1533 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1534 if (std::optional<uint32_t> Address =
1536 if (IsNamedBarrier) {
1537 unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
1538 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1539 }
1540 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1541 } else if (IsNamedBarrier) {
1542 llvm_unreachable("named barrier should have an assigned address");
1543 }
1544 }
1545
1546 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1547 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1548 if (!MFI->isModuleEntryFunction() &&
1549 GV->getName() != "llvm.amdgcn.module.lds" &&
1551 SDLoc DL(Op);
1552 const Function &Fn = DAG.getMachineFunction().getFunction();
1554 Fn, "local memory global used by non-kernel function",
1555 DL.getDebugLoc(), DS_Warning));
1556
1557 // We currently don't have a way to correctly allocate LDS objects that
1558 // aren't directly associated with a kernel. We do force inlining of
1559 // functions that use local objects. However, if these dead functions are
1560 // not eliminated, we don't want a compile time error. Just emit a warning
1561 // and a trap, since there should be no callable path here.
1562 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1563 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1564 Trap, DAG.getRoot());
1565 DAG.setRoot(OutputChain);
1566 return DAG.getPOISON(Op.getValueType());
1567 }
1568
1569 // XXX: What does the value of G->getOffset() mean?
1570 assert(G->getOffset() == 0 &&
1571 "Do not know what to do with an non-zero offset");
1572
1573 // TODO: We could emit code to handle the initialization somewhere.
1574 // We ignore the initializer for now and legalize it to allow selection.
1575 // The initializer will be rejected during assembly emission anyway.
1576 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1577 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1578 }
1579 return SDValue();
1580}
1581
1583 SelectionDAG &DAG) const {
1585 SDLoc SL(Op);
1586
1587 EVT VT = Op.getValueType();
1588 if (VT.getVectorElementType().getSizeInBits() < 32) {
1589 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1590 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1591 unsigned NewNumElt = OpBitSize / 32;
1592 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1594 MVT::i32, NewNumElt);
1595 for (const SDUse &U : Op->ops()) {
1596 SDValue In = U.get();
1597 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1598 if (NewNumElt > 1)
1599 DAG.ExtractVectorElements(NewIn, Args);
1600 else
1601 Args.push_back(NewIn);
1602 }
1603
1604 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1605 NewNumElt * Op.getNumOperands());
1606 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1607 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1608 }
1609 }
1610
1611 for (const SDUse &U : Op->ops())
1612 DAG.ExtractVectorElements(U.get(), Args);
1613
1614 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1615}
1616
1618 SelectionDAG &DAG) const {
1619 SDLoc SL(Op);
1621 unsigned Start = Op.getConstantOperandVal(1);
1622 EVT VT = Op.getValueType();
1623 EVT SrcVT = Op.getOperand(0).getValueType();
1624
1625 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1626 unsigned NumElt = VT.getVectorNumElements();
1627 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1628 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1629
1630 // Extract 32-bit registers at a time.
1631 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1632 EVT NewVT = NumElt == 2
1633 ? MVT::i32
1634 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1635 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1636
1637 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1638 if (NumElt == 2)
1639 Tmp = Args[0];
1640 else
1641 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1642
1643 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1644 }
1645
1646 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1648
1649 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1650}
1651
1652// TODO: Handle fabs too
1654 if (Val.getOpcode() == ISD::FNEG)
1655 return Val.getOperand(0);
1656
1657 return Val;
1658}
1659
1661 if (Val.getOpcode() == ISD::FNEG)
1662 Val = Val.getOperand(0);
1663 if (Val.getOpcode() == ISD::FABS)
1664 Val = Val.getOperand(0);
1665 if (Val.getOpcode() == ISD::FCOPYSIGN)
1666 Val = Val.getOperand(0);
1667 return Val;
1668}
1669
1671 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1672 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1673 SelectionDAG &DAG = DCI.DAG;
1674 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1675 switch (CCOpcode) {
1676 case ISD::SETOEQ:
1677 case ISD::SETONE:
1678 case ISD::SETUNE:
1679 case ISD::SETNE:
1680 case ISD::SETUEQ:
1681 case ISD::SETEQ:
1682 case ISD::SETFALSE:
1683 case ISD::SETFALSE2:
1684 case ISD::SETTRUE:
1685 case ISD::SETTRUE2:
1686 case ISD::SETUO:
1687 case ISD::SETO:
1688 break;
1689 case ISD::SETULE:
1690 case ISD::SETULT: {
1691 if (LHS == True)
1692 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1693 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1694 }
1695 case ISD::SETOLE:
1696 case ISD::SETOLT:
1697 case ISD::SETLE:
1698 case ISD::SETLT: {
1699 // Ordered. Assume ordered for undefined.
1700
1701 // Only do this after legalization to avoid interfering with other combines
1702 // which might occur.
1704 !DCI.isCalledByLegalizer())
1705 return SDValue();
1706
1707 // We need to permute the operands to get the correct NaN behavior. The
1708 // selected operand is the second one based on the failing compare with NaN,
1709 // so permute it based on the compare type the hardware uses.
1710 if (LHS == True)
1711 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1712 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1713 }
1714 case ISD::SETUGE:
1715 case ISD::SETUGT: {
1716 if (LHS == True)
1717 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1718 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1719 }
1720 case ISD::SETGT:
1721 case ISD::SETGE:
1722 case ISD::SETOGE:
1723 case ISD::SETOGT: {
1725 !DCI.isCalledByLegalizer())
1726 return SDValue();
1727
1728 if (LHS == True)
1729 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1730 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1731 }
1732 case ISD::SETCC_INVALID:
1733 llvm_unreachable("Invalid setcc condcode!");
1734 }
1735 return SDValue();
1736}
1737
1738/// Generate Min/Max node
1740 SDValue LHS, SDValue RHS,
1741 SDValue True, SDValue False,
1742 SDValue CC,
1743 DAGCombinerInfo &DCI) const {
1744 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1745 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1746
1747 SelectionDAG &DAG = DCI.DAG;
1748
1749 // If we can't directly match this, try to see if we can fold an fneg to
1750 // match.
1751
1752 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1753 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1754 SDValue NegTrue = peekFNeg(True);
1755
1756 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1757 // fmin/fmax.
1758 //
1759 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1760 // -> fneg (fmin_legacy lhs, K)
1761 //
1762 // TODO: Use getNegatedExpression
1763 if (LHS == NegTrue && CFalse && CRHS) {
1764 APFloat NegRHS = neg(CRHS->getValueAPF());
1765 if (NegRHS == CFalse->getValueAPF()) {
1766 SDValue Combined =
1767 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1768 if (Combined)
1769 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1770 return SDValue();
1771 }
1772 }
1773
1774 return SDValue();
1775}
1776
1777std::pair<SDValue, SDValue>
1778AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1779 SDLoc SL(Op);
1780
1781 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1782
1783 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1784 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1785
1786 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1787 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1788
1789 return std::pair(Lo, Hi);
1790}
1791
1792SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1793 SDLoc SL(Op);
1794
1795 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1796 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1797 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1798}
1799
1800SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1801 SDLoc SL(Op);
1802
1803 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1804 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1805 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1806}
1807
1808// Split a vector type into two parts. The first part is a power of two vector.
1809// The second part is whatever is left over, and is a scalar if it would
1810// otherwise be a 1-vector.
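// For example, v3i32 splits into (v2i32, i32), v6f32 into (v4f32, v2f32),
// and v8i32 into (v4i32, v4i32).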
1811std::pair<EVT, EVT>
1812AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1813 EVT LoVT, HiVT;
1814 EVT EltVT = VT.getVectorElementType();
1815 unsigned NumElts = VT.getVectorNumElements();
1816 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1817 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1818 HiVT = NumElts - LoNumElts == 1
1819 ? EltVT
1820 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1821 return std::pair(LoVT, HiVT);
1822}
1823
1824// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1825// scalar.
1826std::pair<SDValue, SDValue>
1827AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1828 const EVT &LoVT, const EVT &HiVT,
1829 SelectionDAG &DAG) const {
1830 EVT VT = N.getValueType();
1831 assert(LoVT.getVectorNumElements() +
1832 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1833 VT.getVectorNumElements() &&
1834 "More vector elements requested than available!");
1835 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1836 DAG.getVectorIdxConstant(0, DL));
1837
1838 unsigned LoNumElts = LoVT.getVectorNumElements();
1839
1840 if (HiVT.isVector()) {
1841 unsigned HiNumElts = HiVT.getVectorNumElements();
1842 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1843 // Avoid creating an extract_subvector with an index that isn't a multiple
1844 // of the result type.
1845 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
1846 DAG.getConstant(LoNumElts, DL, MVT::i32));
1847 return {Lo, Hi};
1848 }
1849
1850 SmallVector<SDValue, 16> Elts;
1851 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1852 /*Count=*/HiNumElts);
1853 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1854 return {Lo, Hi};
1855 }
1856
1857 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, HiVT, N,
1858 DAG.getVectorIdxConstant(LoNumElts, DL));
1859 return {Lo, Hi};
1860}
1861
1862SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1863 SelectionDAG &DAG) const {
1864 LoadSDNode *Load = cast<LoadSDNode>(Op);
1865 EVT VT = Op.getValueType();
1866 SDLoc SL(Op);
1867
1868
1869 // If this is a 2 element vector, we really want to scalarize and not create
1870 // weird 1 element vectors.
1871 if (VT.getVectorNumElements() == 2) {
1872 SDValue Ops[2];
1873 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1874 return DAG.getMergeValues(Ops, SL);
1875 }
1876
1877 SDValue BasePtr = Load->getBasePtr();
1878 EVT MemVT = Load->getMemoryVT();
1879
1880 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1881
1882 EVT LoVT, HiVT;
1883 EVT LoMemVT, HiMemVT;
1884 SDValue Lo, Hi;
1885
1886 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1887 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1888 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1889
1890 unsigned Size = LoMemVT.getStoreSize();
1891 Align BaseAlign = Load->getAlign();
1892 Align HiAlign = commonAlignment(BaseAlign, Size);
1893
1894 SDValue LoLoad = DAG.getExtLoad(
1895 Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
1896 LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
1897 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1898 SDValue HiLoad = DAG.getExtLoad(
1899 Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
1900 SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
1901 Load->getMemOperand()->getFlags(), Load->getAAInfo());
1902
1903 SDValue Join;
1904 if (LoVT == HiVT) {
1905 // This is the case that the vector is power of two so was evenly split.
1906 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1907 } else {
1908 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1909 DAG.getVectorIdxConstant(0, SL));
1910 Join = DAG.getNode(
1911 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1912 VT, Join, HiLoad,
1913 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1914 }
1915
1916 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1917 LoLoad.getValue(1), HiLoad.getValue(1))};
1918
1919 return DAG.getMergeValues(Ops, SL);
1920}
1921
1922SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1923 SelectionDAG &DAG) const {
1924 LoadSDNode *Load = cast<LoadSDNode>(Op);
1925 EVT VT = Op.getValueType();
1926 SDValue BasePtr = Load->getBasePtr();
1927 EVT MemVT = Load->getMemoryVT();
1928 SDLoc SL(Op);
1929 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1930 Align BaseAlign = Load->getAlign();
1931 unsigned NumElements = MemVT.getVectorNumElements();
1932
1933 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1934 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1935 if (NumElements != 3 ||
1936 (BaseAlign < Align(8) &&
1937 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1938 return SplitVectorLoad(Op, DAG);
1939
1940 assert(NumElements == 3);
1941
1942 EVT WideVT =
1943 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1944 EVT WideMemVT =
1945 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1946 SDValue WideLoad = DAG.getExtLoad(
1947 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1948 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1949 return DAG.getMergeValues(
1950 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1951 DAG.getVectorIdxConstant(0, SL)),
1952 WideLoad.getValue(1)},
1953 SL);
1954}
1955
1956SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1957 SelectionDAG &DAG) const {
1958 StoreSDNode *Store = cast<StoreSDNode>(Op);
1959 SDValue Val = Store->getValue();
1960 EVT VT = Val.getValueType();
1961
1962 // If this is a 2 element vector, we really want to scalarize and not create
1963 // weird 1 element vectors.
1964 if (VT.getVectorNumElements() == 2)
1965 return scalarizeVectorStore(Store, DAG);
1966
1967 EVT MemVT = Store->getMemoryVT();
1968 SDValue Chain = Store->getChain();
1969 SDValue BasePtr = Store->getBasePtr();
1970 SDLoc SL(Op);
1971
1972 EVT LoVT, HiVT;
1973 EVT LoMemVT, HiMemVT;
1974 SDValue Lo, Hi;
1975
1976 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1977 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1978 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1979
1980 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1981
1982 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1983 Align BaseAlign = Store->getAlign();
1984 unsigned Size = LoMemVT.getStoreSize();
1985 Align HiAlign = commonAlignment(BaseAlign, Size);
1986
1987 SDValue LoStore =
1988 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1989 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1990 SDValue HiStore = DAG.getTruncStore(
1991 Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
1992 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1993
1994 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1995}
1996
1997// This is a shortcut for integer division because we have fast i32<->f32
1998// conversions, and fast f32 reciprocal instructions. The 24-bit significand of
1999// an f32 is enough to represent a 24-bit signed integer exactly.
2000SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
2001 bool Sign) const {
2002 SDLoc DL(Op);
2003 EVT VT = Op.getValueType();
2004 SDValue LHS = Op.getOperand(0);
2005 SDValue RHS = Op.getOperand(1);
2006 MVT IntVT = MVT::i32;
2007 MVT FltVT = MVT::f32;
2008
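 // The fast path below is only exact when both operands fit in the 24-bit
 // significand of an f32; requiring at least 9 redundant sign bits on each
 // 32-bit operand guarantees that.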
2009 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
2010 if (LHSSignBits < 9)
2011 return SDValue();
2012
2013 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2014 if (RHSSignBits < 9)
2015 return SDValue();
2016
2017 unsigned BitSize = VT.getSizeInBits();
2018 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2019 unsigned DivBits = BitSize - SignBits;
2020 if (Sign)
2021 ++DivBits;
2022
2023 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2024 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
2025
2026 SDValue jq = DAG.getConstant(1, DL, IntVT);
2027
2028 if (Sign) {
2029 // char|short jq = ia ^ ib;
2030 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2031
2032 // jq = jq >> (bitsize - 2)
2033 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2034 DAG.getConstant(BitSize - 2, DL, VT));
2035
2036 // jq = jq | 0x1
2037 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2038 }
2039
2040 // int ia = (int)LHS;
2041 SDValue ia = LHS;
2042
2043 // int ib = (int)RHS;
2044 SDValue ib = RHS;
2045
2046 // float fa = (float)ia;
2047 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2048
2049 // float fb = (float)ib;
2050 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2051
2052 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2053 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2054
2055 // fq = trunc(fq);
2056 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2057
2058 // float fqneg = -fq;
2059 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2060
2061 MachineFunction &MF = DAG.getMachineFunction();
2062
2063 bool UseFmadFtz = false;
2064 if (Subtarget->isGCN()) {
2065 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2066 UseFmadFtz =
2067 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
2068 }
2069
2070 // float fr = mad(fqneg, fb, fa);
2071 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2072 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2073 : (unsigned)ISD::FMAD;
2074 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2075
2076 // int iq = (int)fq;
2077 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2078
2079 // fr = fabs(fr);
2080 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2081
2082 // fb = fabs(fb);
2083 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2084
2085 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2086
2087 // int cv = fr >= fb;
2088 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2089
2090 // jq = (cv ? jq : 0);
2091 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2092
2093 // dst = iq + jq;
2094 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2095
2096 // Rem needs compensation; it's easier to recompute it.
2097 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2098 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2099
2100 // Truncate to number of bits this divide really is.
2101 if (Sign) {
2102 SDValue InRegSize
2103 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2104 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2105 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2106 } else {
2107 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2108 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2109 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2110 }
2111
2112 return DAG.getMergeValues({ Div, Rem }, DL);
2113}
2114
2115void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2116 SelectionDAG &DAG,
2117 SmallVectorImpl<SDValue> &Results) const {
2118 SDLoc DL(Op);
2119 EVT VT = Op.getValueType();
2120
2121 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2122
2123 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2124
2125 SDValue One = DAG.getConstant(1, DL, HalfVT);
2126 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2127
2128 //HiLo split
2129 SDValue LHS_Lo, LHS_Hi;
2130 SDValue LHS = Op.getOperand(0);
2131 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2132
2133 SDValue RHS_Lo, RHS_Hi;
2134 SDValue RHS = Op.getOperand(1);
2135 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2136
2137 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2138 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2139
2140 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2141 LHS_Lo, RHS_Lo);
2142
2143 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2144 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2145
2146 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2147 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2148 return;
2149 }
2150
2151 if (isTypeLegal(MVT::i64)) {
2152 // The algorithm here is based on ideas from "Software Integer Division",
2153 // Tom Rodeheffer, August 2008.
2154
2155 MachineFunction &MF = DAG.getMachineFunction();
2156 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2157
2158 // Compute denominator reciprocal.
2159 unsigned FMAD =
2160 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2161 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2162 ? (unsigned)ISD::FMAD
2163 : (unsigned)AMDGPUISD::FMAD_FTZ;
2164
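 // The bit patterns below are IEEE-754 single-precision constants:
 // 0x4f800000 is 2^32, 0x2f800000 is 2^-32, 0xcf800000 is -2^32, and
 // 0x5f7ffffc is just under 2^64.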
2165 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2166 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2167 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2168 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2169 Cvt_Lo);
2170 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2171 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2172 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2173 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2174 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2175 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2176 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2177 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2178 Mul1);
2179 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2180 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2181 SDValue Rcp64 = DAG.getBitcast(VT,
2182 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2183
2184 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2185 SDValue One64 = DAG.getConstant(1, DL, VT);
2186 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2187 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2188
2189 // First round of UNR (Unsigned integer Newton-Raphson).
2190 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2191 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2192 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2193 SDValue Mulhi1_Lo, Mulhi1_Hi;
2194 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2195 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2196 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2197 Mulhi1_Lo, Zero1);
2198 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2199 Mulhi1_Hi, Add1_Lo.getValue(1));
2200 SDValue Add1 = DAG.getBitcast(VT,
2201 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2202
2203 // Second round of UNR.
2204 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2205 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2206 SDValue Mulhi2_Lo, Mulhi2_Hi;
2207 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2208 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2209 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2210 Mulhi2_Lo, Zero1);
2211 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2212 Mulhi2_Hi, Add2_Lo.getValue(1));
2213 SDValue Add2 = DAG.getBitcast(VT,
2214 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2215
2216 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2217
2218 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2219
2220 SDValue Mul3_Lo, Mul3_Hi;
2221 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2222 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2223 Mul3_Lo, Zero1);
2224 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2225 Mul3_Hi, Sub1_Lo.getValue(1));
2226 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2227 SDValue Sub1 = DAG.getBitcast(VT,
2228 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2229
2230 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2231 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2232 ISD::SETUGE);
2233 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2234 ISD::SETUGE);
2235 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2236
2237 // TODO: Here and below portions of the code can be enclosed into if/endif.
2238 // Currently control flow is unconditional and we have 4 selects after
2239 // potential endif to substitute PHIs.
2240
2241 // if C3 != 0 ...
2242 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2243 RHS_Lo, Zero1);
2244 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2245 RHS_Hi, Sub1_Lo.getValue(1));
2246 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2247 Zero, Sub2_Lo.getValue(1));
2248 SDValue Sub2 = DAG.getBitcast(VT,
2249 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2250
2251 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2252
2253 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2254 ISD::SETUGE);
2255 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2256 ISD::SETUGE);
2257 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2258
2259 // if (C6 != 0)
2260 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2261
2262 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2263 RHS_Lo, Zero1);
2264 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2265 RHS_Hi, Sub2_Lo.getValue(1));
2266 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2267 Zero, Sub3_Lo.getValue(1));
2268 SDValue Sub3 = DAG.getBitcast(VT,
2269 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2270
2271 // endif C6
2272 // endif C3
2273
2274 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2275 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2276
2277 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2278 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2279
2280 Results.push_back(Div);
2281 Results.push_back(Rem);
2282
2283 return;
2284 }
2285
2286 // r600 expansion.
2287 // Get Speculative values
2288 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2289 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2290
2291 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2292 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2293 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2294
2295 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2296 SDValue DIV_Lo = Zero;
2297
2298 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2299
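 // Restoring long division over the remaining low half: shift one bit of
 // LHS_Lo into REM each iteration and, whenever REM >= RHS, subtract RHS and
 // set the corresponding quotient bit.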
2300 for (unsigned i = 0; i < halfBitWidth; ++i) {
2301 const unsigned bitPos = halfBitWidth - i - 1;
2302 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2303 // Get value of high bit
2304 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2305 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2306 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2307
2308 // Shift
2309 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2310 // Add LHS high bit
2311 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2312
2313 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2314 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2315
2316 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2317
2318 // Update REM
2319 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2320 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2321 }
2322
2323 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2324 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2325 Results.push_back(DIV);
2326 Results.push_back(REM);
2327}
2328
2329SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2330 SelectionDAG &DAG) const {
2331 SDLoc DL(Op);
2332 EVT VT = Op.getValueType();
2333
2334 if (VT == MVT::i64) {
2335 SmallVector<SDValue, 2> Results;
2336 LowerUDIVREM64(Op, DAG, Results);
2337 return DAG.getMergeValues(Results, DL);
2338 }
2339
2340 if (VT == MVT::i32) {
2341 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2342 return Res;
2343 }
2344
2345 SDValue X = Op.getOperand(0);
2346 SDValue Y = Op.getOperand(1);
2347
2348 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2349 // algorithm used here.
2350
2351 // Initial estimate of inv(y).
2352 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2353
2354 // One round of UNR.
2355 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2356 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2357 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2358 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2359
2360 // Quotient/remainder estimate.
2361 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2362 SDValue R =
2363 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2364
2365 // First quotient/remainder refinement.
2366 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2367 SDValue One = DAG.getConstant(1, DL, VT);
2368 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2369 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2370 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2371 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2372 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2373
2374 // Second quotient/remainder refinement.
2375 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2376 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2377 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2378 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2379 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2380
2381 return DAG.getMergeValues({Q, R}, DL);
2382}
2383
2384SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2385 SelectionDAG &DAG) const {
2386 SDLoc DL(Op);
2387 EVT VT = Op.getValueType();
2388
2389 SDValue LHS = Op.getOperand(0);
2390 SDValue RHS = Op.getOperand(1);
2391
2392 SDValue Zero = DAG.getConstant(0, DL, VT);
2393 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2394
2395 if (VT == MVT::i32) {
2396 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2397 return Res;
2398 }
2399
2400 if (VT == MVT::i64 &&
2401 DAG.ComputeNumSignBits(LHS) > 32 &&
2402 DAG.ComputeNumSignBits(RHS) > 32) {
2403 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2404
2405 //HiLo split
2406 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2407 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2408 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2409 LHS_Lo, RHS_Lo);
2410 SDValue Res[2] = {
2411 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2412 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2413 };
2414 return DAG.getMergeValues(Res, DL);
2415 }
2416
2417 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2418 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2419 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2420 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2421
2422 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2423 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2424
2425 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2426 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2427
2428 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2429 SDValue Rem = Div.getValue(1);
2430
2431 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2432 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2433
2434 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2435 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2436
2437 SDValue Res[2] = {
2438 Div,
2439 Rem
2440 };
2441 return DAG.getMergeValues(Res, DL);
2442}
2443
2444SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2445 SDLoc SL(Op);
2446 SDValue Src = Op.getOperand(0);
2447
2448 // result = trunc(src)
2449 // if (src > 0.0 && src != result)
2450 // result += 1.0
2451
2452 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2453
2454 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2455 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2456
2457 EVT SetCCVT =
2458 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2459
2460 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2461 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2462 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2463
2464 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2465 // TODO: Should this propagate fast-math-flags?
2466 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2467}
2468
2469static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2470 SelectionDAG &DAG) {
2471 const unsigned FractBits = 52;
2472 const unsigned ExpBits = 11;
2473
2474 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2475 Hi,
2476 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2477 DAG.getConstant(ExpBits, SL, MVT::i32));
2478 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2479 DAG.getConstant(1023, SL, MVT::i32));
2480
2481 return Exp;
2482}
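// The high 32 bits of an f64 hold the sign and the 11-bit biased exponent
// (bits [62:52] of the full value), so the field starts at bit 52 - 32 = 20
// of Hi; subtracting the bias of 1023 yields the unbiased exponent.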
2483
2484SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2485 SDLoc SL(Op);
2486 SDValue Src = Op.getOperand(0);
2487
2488 assert(Op.getValueType() == MVT::f64);
2489
2490 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2491
2492 // Extract the upper half, since this is where we will find the sign and
2493 // exponent.
2494 SDValue Hi = getHiHalf64(Src, DAG);
2495
2496 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2497
2498 const unsigned FractBits = 52;
2499
2500 // Extract the sign bit.
2501 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2502 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2503
2504 // Extend back to 64-bits.
2505 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2506 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2507
2508 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2509 const SDValue FractMask
2510 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2511
2512 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2513 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2514 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2515
2516 EVT SetCCVT =
2517 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2518
2519 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2520
2521 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2522 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2523
2524 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2525 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2526
2527 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2528}
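// The masking above implements trunc for f64 without an integer conversion:
// Exp < 0 means |Src| < 1.0 so only the sign survives, Exp > 51 means Src is
// already integral, and otherwise the low (52 - Exp) fraction bits are
// cleared.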
2529
2530SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2531 SelectionDAG &DAG) const {
2532 SDLoc SL(Op);
2533 SDValue Src = Op.getOperand(0);
2534
2535 assert(Op.getValueType() == MVT::f64);
2536
2537 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2538 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2539 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2540
2541 // TODO: Should this propagate fast-math-flags?
2542
2543 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2544 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2545
2546 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2547
2548 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2549 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2550
2551 EVT SetCCVT =
2552 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2553 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2554
2555 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2556}
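// Adding and then subtracting 2^52 (with the sign of Src) forces the value
// to be rounded to an integer in the default round-to-nearest-even mode;
// inputs with a magnitude above 0x1.fffffffffffffp+51 are already integral,
// so Src is returned unchanged.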
2557
2558SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2559 SelectionDAG &DAG) const {
2560 // FNEARBYINT and FRINT are the same, except in their handling of FP
2561 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2562 // rint, so just treat them as equivalent.
2563 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2564 Op.getOperand(0));
2565}
2566
2567SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2568 auto VT = Op.getValueType();
2569 auto Arg = Op.getOperand(0u);
2570 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2571}
2572
2573// XXX - May require not supporting f32 denormals?
2574
2575// Don't handle v2f16. The extra instructions to scalarize and repack around the
2576// compare and vselect end up producing worse code than scalarizing the whole
2577// operation.
2578SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2579 SDLoc SL(Op);
2580 SDValue X = Op.getOperand(0);
2581 EVT VT = Op.getValueType();
2582
2583 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2584
2585 // TODO: Should this propagate fast-math-flags?
2586
2587 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2588
2589 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2590
2591 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2592 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2593
2594 EVT SetCCVT =
2595 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2596
2597 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2598 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2599 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2600
2601 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2602 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2603}
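// This rounds half away from zero: when |X - trunc(X)| >= 0.5 a copy of 1.0
// with X's sign is added, e.g. 2.5 becomes 3.0 and -2.5 becomes -3.0.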
2604
2605SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2606 SDLoc SL(Op);
2607 SDValue Src = Op.getOperand(0);
2608
2609 // result = trunc(src);
2610 // if (src < 0.0 && src != result)
2611 // result += -1.0.
2612
2613 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2614
2615 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2616 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2617
2618 EVT SetCCVT =
2619 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2620
2621 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2622 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2623 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2624
2625 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2626 // TODO: Should this propagate fast-math-flags?
2627 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2628}
2629
2630/// Return true if it's known that \p Src can never be an f32 denormal value.
2631static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2632 switch (Src.getOpcode()) {
2633 case ISD::FP_EXTEND:
2634 return Src.getOperand(0).getValueType() == MVT::f16;
2635 case ISD::FP16_TO_FP:
2636 case ISD::FFREXP:
2637 return true;
2638 case ISD::INTRINSIC_WO_CHAIN: {
2639 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2640 switch (IntrinsicID) {
2641 case Intrinsic::amdgcn_frexp_mant:
2642 return true;
2643 default:
2644 return false;
2645 }
2646 }
2647 default:
2648 return false;
2649 }
2650
2651 llvm_unreachable("covered opcode switch");
2652}
2653
2654static bool allowApproxFunc(const SelectionDAG &DAG,
2655 SDNodeFlags Flags) {
2656 return Flags.hasApproximateFuncs();
2657}
2658
2667
2668SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2669 SDValue Src,
2670 SDNodeFlags Flags) const {
2671 SDLoc SL(Src);
2672 EVT VT = Src.getValueType();
2673 const fltSemantics &Semantics = VT.getFltSemantics();
2674 SDValue SmallestNormal =
2675 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2676
2677 // Want to scale denormals up, but negatives and 0 work just as well on the
2678 // scaled path.
2679 SDValue IsLtSmallestNormal = DAG.getSetCC(
2680 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2681 SmallestNormal, ISD::SETOLT);
2682
2683 return IsLtSmallestNormal;
2684}
2685
2686SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2687 SDNodeFlags Flags) const {
2688 SDLoc SL(Src);
2689 EVT VT = Src.getValueType();
2690 const fltSemantics &Semantics = VT.getFltSemantics();
2691 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2692
2693 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2694 SDValue IsFinite = DAG.getSetCC(
2695 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2696 Inf, ISD::SETOLT);
2697 return IsFinite;
2698}
2699
2700/// If denormal handling is required return the scaled input to FLOG2, and the
2701/// check for denormal range. Otherwise, return null values.
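/// A denormal input is multiplied by 2^32 to move it into the normal range;
/// callers subtract the matching offset (32.0 for log2, 32*log10(2) or
/// 32*ln(2) in LowerFLOGCommon) from the final result.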
2702std::pair<SDValue, SDValue>
2703AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc &SL,
2704 SDValue Src, SDNodeFlags Flags) const {
2705 if (!needsDenormHandlingF32(DAG, Src, Flags))
2706 return {};
2707
2708 MVT VT = MVT::f32;
2709 const fltSemantics &Semantics = APFloat::IEEEsingle();
2710 SDValue SmallestNormal =
2711 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2712
2713 SDValue IsLtSmallestNormal = DAG.getSetCC(
2714 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2715 SmallestNormal, ISD::SETOLT);
2716
2717 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2718 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2719 SDValue ScaleFactor =
2720 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2721
2722 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2723 return {ScaledInput, IsLtSmallestNormal};
2724}
2725
2726SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2727 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2728 // If we have to handle denormals, scale up the input and adjust the result.
2729
2730 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2731 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2732
2733 SDLoc SL(Op);
2734 EVT VT = Op.getValueType();
2735 SDValue Src = Op.getOperand(0);
2736 SDNodeFlags Flags = Op->getFlags();
2737
2738 if (VT == MVT::f16) {
2739 // Nothing in half is a denormal when promoted to f32.
2740 assert(!Subtarget->has16BitInsts());
2741 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2742 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2743 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2744 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2745 }
2746
2747 auto [ScaledInput, IsLtSmallestNormal] =
2748 getScaledLogInput(DAG, SL, Src, Flags);
2749 if (!ScaledInput)
2750 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2751
2752 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2753
2754 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2755 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2756 SDValue ResultOffset =
2757 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2758 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2759}
2760
2761static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2762 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2763 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2764 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2765}
2766
2767SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2768 SelectionDAG &DAG) const {
2769 SDValue X = Op.getOperand(0);
2770 EVT VT = Op.getValueType();
2771 SDNodeFlags Flags = Op->getFlags();
2772 SDLoc DL(Op);
2773 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2774 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2775
2776 const auto &Options = getTargetMachine().Options;
2777 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2778
2779 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2780 // Log and multiply in f32 is good enough for f16.
2781 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2782 }
2783
2784 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2785 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2786 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2787 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2788 }
2789
2790 return Lowered;
2791 }
2792
2793 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2794 if (ScaledInput)
2795 X = ScaledInput;
2796
2797 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2798
2799 SDValue R;
2800 if (Subtarget->hasFastFMAF32()) {
2801 // c+cc are ln(2)/ln(10) to more than 49 bits
2802 const float c_log10 = 0x1.344134p-2f;
2803 const float cc_log10 = 0x1.09f79ep-26f;
2804
2805 // c + cc is ln(2) to more than 49 bits
2806 const float c_log = 0x1.62e42ep-1f;
2807 const float cc_log = 0x1.efa39ep-25f;
2808
2809 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2810 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2811 // This adds correction terms for which contraction may lead to an increase
2812 // in the error of the approximation, so disable it.
2813 Flags.setAllowContract(false);
2814 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2815 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2816 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2817 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2818 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2819 } else {
2820 // ch+ct is ln(2)/ln(10) to more than 36 bits
2821 const float ch_log10 = 0x1.344000p-2f;
2822 const float ct_log10 = 0x1.3509f6p-18f;
2823
2824 // ch + ct is ln(2) to more than 36 bits
2825 const float ch_log = 0x1.62e000p-1f;
2826 const float ct_log = 0x1.0bfbe8p-15f;
2827
2828 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2829 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2830
2831 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2832 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2833 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2834 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2835 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2836 // This adds correction terms for which contraction may lead to an increase
2837 // in the error of the approximation, so disable it.
2838 Flags.setAllowContract(false);
2839 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2840 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2841 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2842 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2843 }
2844
2845 const bool IsFiniteOnly =
2846 (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
2847
2848 // TODO: Check if known finite from source value.
2849 if (!IsFiniteOnly) {
2850 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2851 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2852 }
2853
2854 if (IsScaled) {
2855 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2856 SDValue ShiftK =
2857 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2858 SDValue Shift =
2859 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2860 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2861 }
2862
2863 return R;
2864}
2865
2869
2870// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2871// promoted f16 operation.
2872SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2873 SelectionDAG &DAG, bool IsLog10,
2874 SDNodeFlags Flags) const {
2875 EVT VT = Src.getValueType();
2876 unsigned LogOp =
2877 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2878
2879 double Log2BaseInverted =
2880 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2881
2882 if (VT == MVT::f32) {
2883 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2884 if (ScaledInput) {
2885 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2886 SDValue ScaledResultOffset =
2887 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2888
2889 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2890
2891 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2892 ScaledResultOffset, Zero, Flags);
2893
2894 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2895
2896 if (Subtarget->hasFastFMAF32())
2897 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2898 Flags);
2899 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2900 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2901 }
2902 }
2903
2904 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2905 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2906
2907 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2908 Flags);
2909}
2910
2911SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2912 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2913 // If we have to handle denormals, scale up the input and adjust the result.
2914
2915 SDLoc SL(Op);
2916 EVT VT = Op.getValueType();
2917 SDValue Src = Op.getOperand(0);
2918 SDNodeFlags Flags = Op->getFlags();
2919
2920 if (VT == MVT::f16) {
2921 // Nothing in half is a denormal when promoted to f32.
2922 assert(!Subtarget->has16BitInsts());
2923 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2924 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2925 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2926 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2927 }
2928
2929 assert(VT == MVT::f32);
2930
2931 if (!needsDenormHandlingF32(DAG, Src, Flags))
2932 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2933
2934 // bool needs_scaling = x < -0x1.f80000p+6f;
2935 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2936
2937 // -nextafter(128.0, -1)
2938 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2939
2940 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2941
2942 SDValue NeedsScaling =
2943 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2944
2945 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2946 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2947
2948 SDValue AddOffset =
2949 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2950
2951 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2952 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2953
2954 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2955 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2956 SDValue ResultScale =
2957 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2958
2959 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2960}
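// -0x1.f80000p+6f is -126.0, the point below which an f32 exp2 result goes
// denormal; biasing such inputs by +64 keeps v_exp_f32 in the normal range,
// and the final multiply by 2^-64 undoes the bias.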
2961
2962SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
2963 SelectionDAG &DAG,
2964 SDNodeFlags Flags,
2965 bool IsExp10) const {
2966 // exp(x) -> exp2(M_LOG2E_F * x);
2967 // exp10(x) -> exp2(log2(10) * x);
2968 EVT VT = X.getValueType();
2969 SDValue Const =
2970 DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
2971
2972 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
2973 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2974 : (unsigned)ISD::FEXP2,
2975 SL, VT, Mul, Flags);
2976}
2977
2978SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2979 SelectionDAG &DAG,
2980 SDNodeFlags Flags) const {
2981 EVT VT = X.getValueType();
2982 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
2983 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
2984
2985 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2986
2987 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2988 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2989
2990 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2991
2992 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2993
2994 SDValue AdjustedX =
2995 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2996
2997 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2998 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2999
3000 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
3001
3002 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
3003 SDValue AdjustedResult =
3004 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
3005
3006 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3007 Flags);
3008}
3009
3010/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3011/// handled correctly.
3012SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
3013 SelectionDAG &DAG,
3014 SDNodeFlags Flags) const {
3015 const EVT VT = X.getValueType();
3016
3017 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3018 : static_cast<unsigned>(ISD::FEXP2);
3019
3020 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3021 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3022 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3023 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3024
3025 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3026 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3027 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3028 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3029 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3030 }
3031
3032 // bool s = x < -0x1.2f7030p+5f;
3033 // x += s ? 0x1.0p+5f : 0.0f;
3034 // exp10 = exp2(x * 0x1.a92000p+1f) *
3035 // exp2(x * 0x1.4f0978p-11f) *
3036 // (s ? 0x1.9f623ep-107f : 1.0f);
3037
3038 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3039
3040 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3041 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3042
3043 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3044 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3045 SDValue AdjustedX =
3046 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3047
3048 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3049 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3050
3051 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3052 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3053 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3054 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3055
3056 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3057
3058 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3059 SDValue AdjustedResult =
3060 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3061
3062 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3063 Flags);
3064}
3065
3066SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
3067 EVT VT = Op.getValueType();
3068 SDLoc SL(Op);
3069 SDValue X = Op.getOperand(0);
3070 SDNodeFlags Flags = Op->getFlags();
3071 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3072
3073 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3074 // library behavior. Also, is known-not-daz source sufficient?
3075 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3076 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3077 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3078 }
3079
3080 if (VT.getScalarType() == MVT::f16) {
3081 if (VT.isVector())
3082 return SDValue();
3083
3084 // Nothing in half is a denormal when promoted to f32.
3085 //
3086 // exp(f16 x) ->
3087 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3088 //
3089 // exp10(f16 x) ->
3090 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3091 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3092 SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
3093 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3094 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3095 }
3096
3097 assert(VT == MVT::f32);
3098
3099 // Algorithm:
3100 //
3101 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3102 //
3103 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3104 // n = 64*m + j, 0 <= j < 64
3105 //
3106 // e^x = 2^((64*m + j + f)/64)
3107 // = (2^m) * (2^(j/64)) * 2^(f/64)
3108 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3109 //
3110 // f = x*(64/ln(2)) - n
3111 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3112 //
3113 // e^x = (2^m) * (2^(j/64)) * e^r
3114 //
3115 // (2^(j/64)) is precomputed
3116 //
3117 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3118 // e^r = 1 + q
3119 //
3120 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3121 //
3122 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3123 SDNodeFlags FlagsNoContract = Flags;
3124 FlagsNoContract.setAllowContract(false);
3125
3126 SDValue PH, PL;
3127 if (Subtarget->hasFastFMAF32()) {
3128 const float c_exp = numbers::log2ef;
3129 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3130 const float c_exp10 = 0x1.a934f0p+1f;
3131 const float cc_exp10 = 0x1.2f346ep-24f;
3132
3133 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3134 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3135
3136 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3137 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3138 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3139 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3140 } else {
3141 const float ch_exp = 0x1.714000p+0f;
3142 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3143
3144 const float ch_exp10 = 0x1.a92000p+1f;
3145 const float cl_exp10 = 0x1.4f0978p-11f;
3146
3147 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3148 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3149
3150 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3151 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3152 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3153 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3154 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3155
3156 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3157
3158 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3159 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3160 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3161 }
3162
3163 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3164
3165 // It is unsafe to contract this fsub into the PH multiply.
3166 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3167
3168 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3169 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3170 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3171
3172 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3173
3174 SDValue UnderflowCheckConst =
3175 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3176
3177 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3178 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3179 SDValue Underflow =
3180 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3181
3182 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3183
3184 if (!Flags.hasNoInfs()) {
3185 SDValue OverflowCheckConst =
3186 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3187 SDValue Overflow =
3188 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3189 SDValue Inf =
3190 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3191 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3192 }
3193
3194 return R;
3195}
3196
3197static bool isCtlzOpc(unsigned Opc) {
3198 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3199}
3200
3201static bool isCttzOpc(unsigned Opc) {
3202 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3203}
3204
3205SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3206 SelectionDAG &DAG) const {
3207 auto SL = SDLoc(Op);
3208 auto Opc = Op.getOpcode();
3209 auto Arg = Op.getOperand(0u);
3210 auto ResultVT = Op.getValueType();
3211
3212 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3213 return {};
3214
3216 assert(ResultVT == Arg.getValueType());
3217
3218 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3219 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3220 SDValue NewOp;
3221
3222 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3223 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3224 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3225 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3226 } else {
3227 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3228 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3229 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3230 }
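 // Promoting to i32: when the opcode is CTLZ_ZERO_UNDEF the value is shifted
 // into the top bits, so the 32-bit count already equals the narrow count;
 // for a plain CTLZ the zero-extended count is (32 - NumBits) too large,
 // which the final subtract corrects.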
3231
3232 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3233}
3234
3236 SDLoc SL(Op);
3237 SDValue Src = Op.getOperand(0);
3238
3239 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3240 bool Ctlz = isCtlzOpc(Op.getOpcode());
3241 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3242
3243 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3244 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3245 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3246
3247 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3248 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3249 // (cttz hi:lo) -> (umin (ffbl src), 32)
3250 // (ctlz_zero_undef src) -> (ffbh src)
3251 // (cttz_zero_undef src) -> (ffbl src)
3252
3253 // The 64-bit scalar version produces a 32-bit result:
3254 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3255 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3256 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3257 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3258 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3259 if (!ZeroUndef) {
3260 const SDValue ConstVal = DAG.getConstant(
3261 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3262 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3263 }
3264 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3265 }
3266
3267 SDValue Lo, Hi;
3268 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3269
3270 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3271 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3272
3273 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3274 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3275 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3276 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3277
3278 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3279 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3280 if (Ctlz)
3281 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3282 else
3283 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3284
3285 SDValue NewOpr;
3286 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3287 if (!ZeroUndef) {
3288 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3289 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3290 }
3291
3292 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3293}
3294
3295SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3296 bool Signed) const {
3297 // The regular method converting a 64-bit integer to float roughly consists of
3298 // 2 steps: normalization and rounding. In fact, after normalization, the
3299 // conversion from a 64-bit integer to a float is essentially the same as the
3300 // one from a 32-bit integer. The only difference is that it has more
3301 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3302 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3303 // converted into the correct float number. The basic steps for the unsigned
3304 // conversion are illustrated in the following pseudo code:
3305 //
3306 // f32 uitofp(i64 u) {
3307 // i32 hi, lo = split(u);
3308 // // Only count the leading zeros in hi as we have native support of the
3309 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3310 // // reduced to a 32-bit one automatically.
3311 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3312 // u <<= shamt;
3313 // hi, lo = split(u);
3314 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3315 // // convert it as a 32-bit integer and scale the result back.
3316 // return uitofp(hi) * 2^(32 - shamt);
3317 // }
3318 //
3319 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3320 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3321 // converted instead, followed by negation based on its sign bit.
3322
3323 SDLoc SL(Op);
3324 SDValue Src = Op.getOperand(0);
3325
3326 SDValue Lo, Hi;
3327 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3328 SDValue Sign;
3329 SDValue ShAmt;
3330 if (Signed && Subtarget->isGCN()) {
3331 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3332 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3333 // account. That is, the maximal shift is
3334 // - 32 if Lo and Hi have opposite signs;
3335 // - 33 if Lo and Hi have the same sign.
3336 //
3337 // Or, MaxShAmt = 33 + OppositeSign, where
3338 //
3339 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3340 // - -1 if Lo and Hi have opposite signs; and
3341 // - 0 otherwise.
3342 //
3343 // All in all, ShAmt is calculated as
3344 //
3345 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3346 //
3347 // or
3348 //
3349 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3350 //
3351 // to reduce the critical path.
3352 SDValue OppositeSign = DAG.getNode(
3353 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3354 DAG.getConstant(31, SL, MVT::i32));
3355 SDValue MaxShAmt =
3356 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3357 OppositeSign);
3358 // Count the leading sign bits.
3359 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3360 // Different from unsigned conversion, the shift should be one bit less to
3361 // preserve the sign bit.
3362 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3363 DAG.getConstant(1, SL, MVT::i32));
3364 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3365 } else {
3366 if (Signed) {
3367 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3368 // absolute value first.
3369 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3370 DAG.getConstant(63, SL, MVT::i64));
3371 SDValue Abs =
3372 DAG.getNode(ISD::XOR, SL, MVT::i64,
3373 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3374 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3375 }
3376 // Count the leading zeros.
3377 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3378 // The shift amount for signed integers is [0, 32].
3379 }
3380 // Normalize the given 64-bit integer.
3381 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3382 // Split it again.
3383 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3384 // Calculate the adjust bit for rounding.
3385 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3386 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3387 DAG.getConstant(1, SL, MVT::i32), Lo);
3388 // Get the 32-bit normalized integer.
3389 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3390 // Convert the normalized 32-bit integer into f32.
3391 unsigned Opc =
3392 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3393 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3394
3395  // Finally, we need to scale the converted floating-point value back, since
3396  // the original 64-bit integer was converted as a 32-bit one.
3397 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3398 ShAmt);
3399 // On GCN, use LDEXP directly.
3400 if (Subtarget->isGCN())
3401 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3402
3403 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3404 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3405 // exponent is enough to avoid overflowing into the sign bit.
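  // For example, ShAmt = 9 adds (9 << 23) to the f32 bit pattern, which bumps
  // the biased exponent by 9 and therefore scales the value by 2^9.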
3406 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3407 DAG.getConstant(23, SL, MVT::i32));
3408 SDValue IVal =
3409 DAG.getNode(ISD::ADD, SL, MVT::i32,
3410 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3411 if (Signed) {
3412 // Set the sign bit.
3413 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3414 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3415 DAG.getConstant(31, SL, MVT::i32));
3416 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3417 }
3418 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3419}
3420
3421SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3422                                                bool Signed) const {
3423 SDLoc SL(Op);
3424 SDValue Src = Op.getOperand(0);
3425
3426 SDValue Lo, Hi;
3427 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3428
3429  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3430 SL, MVT::f64, Hi);
3431
3432 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3433
3434 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3435 DAG.getConstant(32, SL, MVT::i32));
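  // Both 32-bit halves convert to f64 exactly (f64 has a 53-bit significand),
  // so the single FADD below yields the correctly rounded result; e.g.
  // 0x1'0000'0001 gives ldexp(1.0, 32) + 1.0 = 4294967297.0.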
3436 // TODO: Should this propagate fast-math-flags?
3437 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3438}
3439
3440SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3441 SelectionDAG &DAG) const {
3442 // TODO: Factor out code common with LowerSINT_TO_FP.
3443 EVT DestVT = Op.getValueType();
3444 SDValue Src = Op.getOperand(0);
3445 EVT SrcVT = Src.getValueType();
3446
3447 if (SrcVT == MVT::i16) {
3448 if (DestVT == MVT::f16)
3449 return Op;
3450 SDLoc DL(Op);
3451
3452 // Promote src to i32
3453 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3454 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3455 }
3456
3457 if (DestVT == MVT::bf16) {
3458 SDLoc SL(Op);
3459 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3460 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3461 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3462 }
3463
3464 if (SrcVT != MVT::i64)
3465 return Op;
3466
3467 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3468 SDLoc DL(Op);
3469
3470 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3471 SDValue FPRoundFlag =
3472 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3473 SDValue FPRound =
3474 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3475
3476 return FPRound;
3477 }
3478
3479 if (DestVT == MVT::f32)
3480 return LowerINT_TO_FP32(Op, DAG, false);
3481
3482 assert(DestVT == MVT::f64);
3483 return LowerINT_TO_FP64(Op, DAG, false);
3484}
3485
3486SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3487 SelectionDAG &DAG) const {
3488 EVT DestVT = Op.getValueType();
3489
3490 SDValue Src = Op.getOperand(0);
3491 EVT SrcVT = Src.getValueType();
3492
3493 if (SrcVT == MVT::i16) {
3494 if (DestVT == MVT::f16)
3495 return Op;
3496
3497 SDLoc DL(Op);
3498 // Promote src to i32
3499 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3500 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3501 }
3502
3503 if (DestVT == MVT::bf16) {
3504 SDLoc SL(Op);
3505 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3506 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3507 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3508 }
3509
3510 if (SrcVT != MVT::i64)
3511 return Op;
3512
3513 // TODO: Factor out code common with LowerUINT_TO_FP.
3514
3515 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3516 SDLoc DL(Op);
3517 SDValue Src = Op.getOperand(0);
3518
3519 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3520 SDValue FPRoundFlag =
3521 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3522 SDValue FPRound =
3523 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3524
3525 return FPRound;
3526 }
3527
3528 if (DestVT == MVT::f32)
3529 return LowerINT_TO_FP32(Op, DAG, true);
3530
3531 assert(DestVT == MVT::f64);
3532 return LowerINT_TO_FP64(Op, DAG, true);
3533}
3534
3535SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3536                                                bool Signed) const {
3537 SDLoc SL(Op);
3538
3539 SDValue Src = Op.getOperand(0);
3540 EVT SrcVT = Src.getValueType();
3541
3542 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3543
3544 // The basic idea of converting a floating point number into a pair of 32-bit
3545 // integers is illustrated as follows:
3546 //
3547 // tf := trunc(val);
3548 // hif := floor(tf * 2^-32);
3549 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3550 // hi := fptoi(hif);
3551 // lo := fptoi(lof);
3552 //
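  // For example, for the f64 value 2^40 + 7: tf = 2^40 + 7,
  // hif = floor((2^40 + 7) * 2^-32) = 256, and the FMA recovers
  // lof = 2^40 + 7 - 256 * 2^32 = 7, so hi:lo = 0x100:0x7, i.e. 2^40 + 7.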
3553 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3554 SDValue Sign;
3555 if (Signed && SrcVT == MVT::f32) {
3556    // However, a 32-bit floating point number has only a 23-bit mantissa,
3557    // which is not enough to hold all the significant bits of `lof` if val is
3558    // negative. To avoid the loss of precision, we need to take the absolute
3559 // value after truncating and flip the result back based on the original
3560 // signedness.
3561 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3562 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3563 DAG.getConstant(31, SL, MVT::i32));
3564 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3565 }
3566
3567 SDValue K0, K1;
3568 if (SrcVT == MVT::f64) {
3569 K0 = DAG.getConstantFP(
3570 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3571 SrcVT);
3572 K1 = DAG.getConstantFP(
3573 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3574 SrcVT);
3575 } else {
3576 K0 = DAG.getConstantFP(
3577 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3578 K1 = DAG.getConstantFP(
3579 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3580 }
3581 // TODO: Should this propagate fast-math-flags?
3582 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3583
3584 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3585
3586 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3587
3588 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3589                                                         : ISD::FP_TO_UINT,
3590 SL, MVT::i32, FloorMul);
3591 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3592
3593 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3594 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3595
3596 if (Signed && SrcVT == MVT::f32) {
3597 assert(Sign);
3598    // Flip the result based on Sign, which is either all 0s or all 1s.
3599 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3600 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3601 // r := xor(r, sign) - sign;
3602 Result =
3603 DAG.getNode(ISD::SUB, SL, MVT::i64,
3604 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3605 }
3606
3607 return Result;
3608}
3609
3610SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3611 SDLoc DL(Op);
3612 SDValue N0 = Op.getOperand(0);
3613
3614 // Convert to target node to get known bits
3615 if (N0.getValueType() == MVT::f32)
3616 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3617
3618 if (Op->getFlags().hasApproximateFuncs()) {
3619 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3620 return SDValue();
3621 }
3622
3623 return LowerF64ToF16Safe(N0, DL, DAG);
3624}
3625
3626// Return the converted f16 result bits in an i32 node.
3627SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3628 SelectionDAG &DAG) const {
3629 assert(Src.getSimpleValueType() == MVT::f64);
3630
3631 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3632 // TODO: We can generate better code for True16.
3633 const unsigned ExpMask = 0x7ff;
3634 const unsigned ExpBiasf64 = 1023;
3635 const unsigned ExpBiasf16 = 15;
3636 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3637 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3638 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3639 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3640 DAG.getConstant(32, DL, MVT::i64));
3641 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3642 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3643 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3644 DAG.getConstant(20, DL, MVT::i64));
3645 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3646 DAG.getConstant(ExpMask, DL, MVT::i32));
3647 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3648 // add the f16 bias (15) to get the biased exponent for the f16 format.
3649 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3650 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3651
3652 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3653 DAG.getConstant(8, DL, MVT::i32));
3654 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3655 DAG.getConstant(0xffe, DL, MVT::i32));
3656
3657 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3658 DAG.getConstant(0x1ff, DL, MVT::i32));
3659 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3660
3661 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3662 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3663
3664 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3665 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3666 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3667 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3668
3669 // N = M | (E << 12);
3670 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3671 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3672 DAG.getConstant(12, DL, MVT::i32)));
3673
3674 // B = clamp(1-E, 0, 13);
3675 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3676 One, E);
3677 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3678 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3679 DAG.getConstant(13, DL, MVT::i32));
3680
3681 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3682 DAG.getConstant(0x1000, DL, MVT::i32));
3683
3684 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3685 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3686 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3687 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3688
3689 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3690 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3691 DAG.getConstant(0x7, DL, MVT::i32));
3692 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3693 DAG.getConstant(2, DL, MVT::i32));
3694 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3695 One, Zero, ISD::SETEQ);
3696 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3697 One, Zero, ISD::SETGT);
3698 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3699 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3700
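  // E > 30 means the value exceeds the largest finite f16 exponent, so the
  // result overflows to infinity (0x7c00). E == 1039 corresponds to an f64
  // exponent field of 0x7ff (2047 - 1023 + 15), i.e. the input was Inf or
  // NaN, so the precomputed Inf/NaN pattern I is selected instead.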
3701 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3702 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3703 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3704 I, V, ISD::SETEQ);
3705
3706 // Extract the sign bit.
3707 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3708 DAG.getConstant(16, DL, MVT::i32));
3709 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3710 DAG.getConstant(0x8000, DL, MVT::i32));
3711
3712 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3713}
3714
3715SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3716 SelectionDAG &DAG) const {
3717 SDValue Src = Op.getOperand(0);
3718 unsigned OpOpcode = Op.getOpcode();
3719 EVT SrcVT = Src.getValueType();
3720 EVT DestVT = Op.getValueType();
3721
3722 // Will be selected natively
3723 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3724 return Op;
3725
3726 if (SrcVT == MVT::bf16) {
3727 SDLoc DL(Op);
3728 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3729 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3730 }
3731
3732 // Promote i16 to i32
3733 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3734 SDLoc DL(Op);
3735
3736 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3737 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3738 }
3739
3740 if (DestVT != MVT::i64)
3741 return Op;
3742
3743 if (SrcVT == MVT::f16 ||
3744 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3745 SDLoc DL(Op);
3746
3747 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3748 unsigned Ext =
3749        OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3750 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3751 }
3752
3753 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3754 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3755
3756 return SDValue();
3757}
3758
3759SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3760 SelectionDAG &DAG) const {
3761 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3762 MVT VT = Op.getSimpleValueType();
3763 MVT ScalarVT = VT.getScalarType();
3764
3765 assert(VT.isVector());
3766
3767 SDValue Src = Op.getOperand(0);
3768 SDLoc DL(Op);
3769
3770 // TODO: Don't scalarize on Evergreen?
3771 unsigned NElts = VT.getVectorNumElements();
3772  SmallVector<SDValue, 8> Args;
3773 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3774
3775 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3776 for (unsigned I = 0; I < NElts; ++I)
3777 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3778
3779 return DAG.getBuildVector(VT, DL, Args);
3780}
3781
3782//===----------------------------------------------------------------------===//
3783// Custom DAG optimizations
3784//===----------------------------------------------------------------------===//
3785
3786static bool isU24(SDValue Op, SelectionDAG &DAG) {
3787 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3788}
3789
3790static bool isI24(SDValue Op, SelectionDAG &DAG) {
3791 EVT VT = Op.getValueType();
3792 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3793 // as unsigned 24-bit values.
3794         AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3795}
3796
3797SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3798                                             TargetLowering::DAGCombinerInfo &DCI) const {
3799 SelectionDAG &DAG = DCI.DAG;
3800 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3801 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3802
3803 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3804 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3805 unsigned NewOpcode = Node24->getOpcode();
3806 if (IsIntrin) {
3807 unsigned IID = Node24->getConstantOperandVal(0);
3808 switch (IID) {
3809 case Intrinsic::amdgcn_mul_i24:
3810 NewOpcode = AMDGPUISD::MUL_I24;
3811 break;
3812 case Intrinsic::amdgcn_mul_u24:
3813 NewOpcode = AMDGPUISD::MUL_U24;
3814 break;
3815 case Intrinsic::amdgcn_mulhi_i24:
3816 NewOpcode = AMDGPUISD::MULHI_I24;
3817 break;
3818 case Intrinsic::amdgcn_mulhi_u24:
3819 NewOpcode = AMDGPUISD::MULHI_U24;
3820 break;
3821 default:
3822 llvm_unreachable("Expected 24-bit mul intrinsic");
3823 }
3824 }
3825
3826 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3827
3828 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3829 // the operands to have other uses, but will only perform simplifications that
3830 // involve bypassing some nodes for this user.
3831 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3832 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3833 if (DemandedLHS || DemandedRHS)
3834 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3835 DemandedLHS ? DemandedLHS : LHS,
3836 DemandedRHS ? DemandedRHS : RHS);
3837
3838 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3839 // operands if this node is the only user.
3840 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3841 return SDValue(Node24, 0);
3842 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3843 return SDValue(Node24, 0);
3844
3845 return SDValue();
3846}
3847
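// Constant fold a bitfield extract: take Width bits of Src0 starting at
// Offset, sign- or zero-extending according to IntTy. For example, Width = 8,
// Offset = 16, Src0 = 0xABCD1234 shifts left by 8 to 0xCD123400 and then
// right by 24, giving 0xFFFFFFCD (signed) or 0xCD (unsigned).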
3848template <typename IntTy>
3849static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3850 uint32_t Width, const SDLoc &DL) {
3851 if (Width + Offset < 32) {
3852 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3853 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3854 if constexpr (std::is_signed_v<IntTy>) {
3855 return DAG.getSignedConstant(Result, DL, MVT::i32);
3856 } else {
3857 return DAG.getConstant(Result, DL, MVT::i32);
3858 }
3859 }
3860
3861 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3862}
3863
3864static bool hasVolatileUser(SDNode *Val) {
3865 for (SDNode *U : Val->users()) {
3866 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3867 if (M->isVolatile())
3868 return true;
3869 }
3870 }
3871
3872 return false;
3873}
3874
3875bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3876 // i32 vectors are the canonical memory type.
3877 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3878 return false;
3879
3880 if (!VT.isByteSized())
3881 return false;
3882
3883 unsigned Size = VT.getStoreSize();
3884
3885 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3886 return false;
3887
3888 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3889 return false;
3890
3891 return true;
3892}
3893
3894// Replace load of an illegal type with a bitcast from a load of a friendlier
3895// type.
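// For example, a load of an illegal <8 x i8> type (8 bytes) can be rewritten
// as a v2i32 load plus a bitcast back, so the memory operation itself uses
// the canonical 32-bit element type.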
3896SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3897 DAGCombinerInfo &DCI) const {
3898 if (!DCI.isBeforeLegalize())
3899 return SDValue();
3900
3901  LoadSDNode *LN = cast<LoadSDNode>(N);
3902 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3903 return SDValue();
3904
3905 SDLoc SL(N);
3906 SelectionDAG &DAG = DCI.DAG;
3907 EVT VT = LN->getMemoryVT();
3908
3909 unsigned Size = VT.getStoreSize();
3910 Align Alignment = LN->getAlign();
3911 if (Alignment < Size && isTypeLegal(VT)) {
3912 unsigned IsFast;
3913 unsigned AS = LN->getAddressSpace();
3914
3915 // Expand unaligned loads earlier than legalization. Due to visitation order
3916 // problems during legalization, the emitted instructions to pack and unpack
3917 // the bytes again are not eliminated in the case of an unaligned copy.
3918    if (!allowsMisalignedMemoryAccesses(
3919 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3920 if (VT.isVector())
3921 return SplitVectorLoad(SDValue(LN, 0), DAG);
3922
3923 SDValue Ops[2];
3924 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3925
3926 return DAG.getMergeValues(Ops, SDLoc(N));
3927 }
3928
3929 if (!IsFast)
3930 return SDValue();
3931 }
3932
3933 if (!shouldCombineMemoryType(VT))
3934 return SDValue();
3935
3936 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3937
3938 SDValue NewLoad
3939 = DAG.getLoad(NewVT, SL, LN->getChain(),
3940 LN->getBasePtr(), LN->getMemOperand());
3941
3942 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3943 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3944 return SDValue(N, 0);
3945}
3946
3947// Replace store of an illegal type with a store of a bitcast to a friendlier
3948// type.
3949SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3950 DAGCombinerInfo &DCI) const {
3951 if (!DCI.isBeforeLegalize())
3952 return SDValue();
3953
3954  StoreSDNode *SN = cast<StoreSDNode>(N);
3955 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3956 return SDValue();
3957
3958 EVT VT = SN->getMemoryVT();
3959 unsigned Size = VT.getStoreSize();
3960
3961 SDLoc SL(N);
3962 SelectionDAG &DAG = DCI.DAG;
3963 Align Alignment = SN->getAlign();
3964 if (Alignment < Size && isTypeLegal(VT)) {
3965 unsigned IsFast;
3966 unsigned AS = SN->getAddressSpace();
3967
3968 // Expand unaligned stores earlier than legalization. Due to visitation
3969 // order problems during legalization, the emitted instructions to pack and
3970 // unpack the bytes again are not eliminated in the case of an unaligned
3971 // copy.
3972    if (!allowsMisalignedMemoryAccesses(
3973 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3974 if (VT.isVector())
3975 return SplitVectorStore(SDValue(SN, 0), DAG);
3976
3977 return expandUnalignedStore(SN, DAG);
3978 }
3979
3980 if (!IsFast)
3981 return SDValue();
3982 }
3983
3984 if (!shouldCombineMemoryType(VT))
3985 return SDValue();
3986
3987 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3988 SDValue Val = SN->getValue();
3989
3990 //DCI.AddToWorklist(Val.getNode());
3991
3992 bool OtherUses = !Val.hasOneUse();
3993 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3994 if (OtherUses) {
3995 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3996 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3997 }
3998
3999 return DAG.getStore(SN->getChain(), SL, CastVal,
4000 SN->getBasePtr(), SN->getMemOperand());
4001}
4002
4003// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4004// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4005// issues.
4006SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4007 DAGCombinerInfo &DCI) const {
4008 SelectionDAG &DAG = DCI.DAG;
4009 SDValue N0 = N->getOperand(0);
4010
4011 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4012 // (vt2 (truncate (assertzext vt0:x, vt1)))
4013 if (N0.getOpcode() == ISD::TRUNCATE) {
4014 SDValue N1 = N->getOperand(1);
4015 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4016 SDLoc SL(N);
4017
4018 SDValue Src = N0.getOperand(0);
4019 EVT SrcVT = Src.getValueType();
4020 if (SrcVT.bitsGE(ExtVT)) {
4021 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4022 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4023 }
4024 }
4025
4026 return SDValue();
4027}
4028
4029SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
4030 SDNode *N, DAGCombinerInfo &DCI) const {
4031 unsigned IID = N->getConstantOperandVal(0);
4032 switch (IID) {
4033 case Intrinsic::amdgcn_mul_i24:
4034 case Intrinsic::amdgcn_mul_u24:
4035 case Intrinsic::amdgcn_mulhi_i24:
4036 case Intrinsic::amdgcn_mulhi_u24:
4037 return simplifyMul24(N, DCI);
4038 case Intrinsic::amdgcn_fract:
4039 case Intrinsic::amdgcn_rsq:
4040 case Intrinsic::amdgcn_rcp_legacy:
4041 case Intrinsic::amdgcn_rsq_legacy:
4042 case Intrinsic::amdgcn_rsq_clamp:
4043 case Intrinsic::amdgcn_tanh:
4044 case Intrinsic::amdgcn_prng_b32: {
4045 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4046 SDValue Src = N->getOperand(1);
4047 return Src.isUndef() ? Src : SDValue();
4048 }
4049 case Intrinsic::amdgcn_frexp_exp: {
4050 // frexp_exp (fneg x) -> frexp_exp x
4051 // frexp_exp (fabs x) -> frexp_exp x
4052 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4053 SDValue Src = N->getOperand(1);
4054 SDValue PeekSign = peekFPSignOps(Src);
4055 if (PeekSign == Src)
4056 return SDValue();
4057 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4058 0);
4059 }
4060 default:
4061 return SDValue();
4062 }
4063}
4064
4065/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4066/// binary operation \p Opc to it with the corresponding constant operands.
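/// For example, (and i64:x, 0x0000ffff00000000) becomes a v2i32 build_vector
/// of (and lo(x), 0x0) and (and hi(x), 0x0000ffff), bitcast back to i64, so
/// each half can be simplified independently.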
4067SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4068 DAGCombinerInfo &DCI, const SDLoc &SL,
4069 unsigned Opc, SDValue LHS,
4070 uint32_t ValLo, uint32_t ValHi) const {
4071 SelectionDAG &DAG = DCI.DAG;
4072 SDValue Lo, Hi;
4073 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4074
4075 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4076 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4077
4078 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4079 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4080
4081 // Re-visit the ands. It's possible we eliminated one of them and it could
4082 // simplify the vector.
4083 DCI.AddToWorklist(Lo.getNode());
4084 DCI.AddToWorklist(Hi.getNode());
4085
4086 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4087 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4088}
4089
4090SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4091 DAGCombinerInfo &DCI) const {
4092 EVT VT = N->getValueType(0);
4093 SDValue LHS = N->getOperand(0);
4094 SDValue RHS = N->getOperand(1);
4095  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4096 SDLoc SL(N);
4097 SelectionDAG &DAG = DCI.DAG;
4098
4099 unsigned RHSVal;
4100 if (CRHS) {
4101 RHSVal = CRHS->getZExtValue();
4102 if (!RHSVal)
4103 return LHS;
4104
4105 switch (LHS->getOpcode()) {
4106 default:
4107 break;
4108 case ISD::ZERO_EXTEND:
4109 case ISD::SIGN_EXTEND:
4110 case ISD::ANY_EXTEND: {
4111 SDValue X = LHS->getOperand(0);
4112
4113 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4114 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4115 // Prefer build_vector as the canonical form if packed types are legal.
4116        // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4117 SDValue Vec = DAG.getBuildVector(
4118 MVT::v2i16, SL,
4119 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4120 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4121 }
4122
4123 // shl (ext x) => zext (shl x), if shift does not overflow int
4124 if (VT != MVT::i64)
4125 break;
4126 KnownBits Known = DAG.computeKnownBits(X);
4127 unsigned LZ = Known.countMinLeadingZeros();
4128 if (LZ < RHSVal)
4129 break;
4130 EVT XVT = X.getValueType();
4131 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4132 return DAG.getZExtOrTrunc(Shl, SL, VT);
4133 }
4134 }
4135 }
4136
4137 if (VT.getScalarType() != MVT::i64)
4138 return SDValue();
4139
4140 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4141 // common case, splitting this into a move and a 32-bit shift is faster and
4142 // the same code size.
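  // For example, (shl i64:x, 40) becomes
  // (bitcast (build_vector 0, (shl (i32 (trunc x)), 8))): the low 32 result
  // bits are zero and the high 32 bits are the low half of x shifted by the
  // remaining 8 bits.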
4143 KnownBits Known = DAG.computeKnownBits(RHS);
4144
4145 EVT ElementType = VT.getScalarType();
4146 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4147 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4148 : TargetScalarType;
4149
4150 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4151 return SDValue();
4152 SDValue ShiftAmt;
4153
4154 if (CRHS) {
4155 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4156 TargetType);
4157 } else {
4158 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4159 const SDValue ShiftMask =
4160 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4161 // This AND instruction will clamp out of bounds shift values.
4162 // It will also be removed during later instruction selection.
4163 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4164 }
4165
4166 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4167 SDValue NewShift =
4168 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4169
4170 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4171 SDValue Vec;
4172
4173 if (VT.isVector()) {
4174 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4175 unsigned NElts = TargetType.getVectorNumElements();
4176    SmallVector<SDValue, 8> HiOps;
4177 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4178
4179 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4180 for (unsigned I = 0; I != NElts; ++I)
4181 HiAndLoOps[2 * I + 1] = HiOps[I];
4182 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4183 } else {
4184 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4185 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4186 }
4187 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4188}
4189
4190SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4191 DAGCombinerInfo &DCI) const {
4192 SDValue RHS = N->getOperand(1);
4193  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4194 EVT VT = N->getValueType(0);
4195 SDValue LHS = N->getOperand(0);
4196 SelectionDAG &DAG = DCI.DAG;
4197 SDLoc SL(N);
4198
4199 if (VT.getScalarType() != MVT::i64)
4200 return SDValue();
4201
4202 // For C >= 32
4203  // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))
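  // e.g. (sra i64:x, 40) -> (build_pair (sra hi_32(x), 8), (sra hi_32(x), 31)):
  // the low word is the high half shifted by the remaining 8 bits and the high
  // word is the replicated sign bit.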
4204
4205 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4206 // common case, splitting this into a move and a 32-bit shift is faster and
4207 // the same code size.
4208 KnownBits Known = DAG.computeKnownBits(RHS);
4209
4210 EVT ElementType = VT.getScalarType();
4211 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4212 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4213 : TargetScalarType;
4214
4215 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4216 return SDValue();
4217
4218 SDValue ShiftFullAmt =
4219 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4220 SDValue ShiftAmt;
4221 if (CRHS) {
4222 unsigned RHSVal = CRHS->getZExtValue();
4223 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4224 TargetType);
4225 } else if (Known.getMinValue().getZExtValue() ==
4226 (ElementType.getSizeInBits() - 1)) {
4227 ShiftAmt = ShiftFullAmt;
4228 } else {
4229 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4230 const SDValue ShiftMask =
4231 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4232 // This AND instruction will clamp out of bounds shift values.
4233 // It will also be removed during later instruction selection.
4234 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4235 }
4236
4237 EVT ConcatType;
4238 SDValue Hi;
4239 SDLoc LHSSL(LHS);
4240 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4241 if (VT.isVector()) {
4242 unsigned NElts = TargetType.getVectorNumElements();
4243 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4244 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4245 SmallVector<SDValue, 8> HiOps(NElts);
4246 SmallVector<SDValue, 16> HiAndLoOps;
4247
4248 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4249 for (unsigned I = 0; I != NElts; ++I) {
4250 HiOps[I] = HiAndLoOps[2 * I + 1];
4251 }
4252 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4253 } else {
4254 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4255 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4256 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4257 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4258 }
4259
4260 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4261 SDValue HiShift;
4262 if (KnownLHS.isNegative()) {
4263 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4264 } else {
4265 Hi = DAG.getFreeze(Hi);
4266 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4267 }
4268 SDValue NewShift =
4269 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4270
4271 SDValue Vec;
4272 if (VT.isVector()) {
4273 unsigned NElts = TargetType.getVectorNumElements();
4274    SmallVector<SDValue, 8> HiOps;
4275    SmallVector<SDValue, 8> LoOps;
4276 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4277
4278 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4279 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4280 for (unsigned I = 0; I != NElts; ++I) {
4281 HiAndLoOps[2 * I + 1] = HiOps[I];
4282 HiAndLoOps[2 * I] = LoOps[I];
4283 }
4284 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4285 } else {
4286 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4287 }
4288 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4289}
4290
4291SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4292 DAGCombinerInfo &DCI) const {
4293 SDValue RHS = N->getOperand(1);
4294  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4295 EVT VT = N->getValueType(0);
4296 SDValue LHS = N->getOperand(0);
4297 SelectionDAG &DAG = DCI.DAG;
4298 SDLoc SL(N);
4299 unsigned RHSVal;
4300
4301 if (CRHS) {
4302 RHSVal = CRHS->getZExtValue();
4303
4304 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4305 // this improves the ability to match BFE patterns in isel.
4306 if (LHS.getOpcode() == ISD::AND) {
4307 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4308 unsigned MaskIdx, MaskLen;
4309 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4310 MaskIdx == RHSVal) {
4311 return DAG.getNode(ISD::AND, SL, VT,
4312 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4313 N->getOperand(1)),
4314 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4315 N->getOperand(1)));
4316 }
4317 }
4318 }
4319 }
4320
4321 if (VT.getScalarType() != MVT::i64)
4322 return SDValue();
4323
4324 // for C >= 32
4325 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
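  // e.g. (srl i64:x, 40) -> (build_pair (srl hi_32(x), 8), 0).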
4326
4327 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4328 // common case, splitting this into a move and a 32-bit shift is faster and
4329 // the same code size.
4330 KnownBits Known = DAG.computeKnownBits(RHS);
4331
4332 EVT ElementType = VT.getScalarType();
4333 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4334 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4335 : TargetScalarType;
4336
4337 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4338 return SDValue();
4339
4340 SDValue ShiftAmt;
4341 if (CRHS) {
4342 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4343 TargetType);
4344 } else {
4345 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4346 const SDValue ShiftMask =
4347 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4348 // This AND instruction will clamp out of bounds shift values.
4349 // It will also be removed during later instruction selection.
4350 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4351 }
4352
4353 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4354 EVT ConcatType;
4355 SDValue Hi;
4356 SDLoc LHSSL(LHS);
4357 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4358 if (VT.isVector()) {
4359 unsigned NElts = TargetType.getVectorNumElements();
4360 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4361 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4362 SmallVector<SDValue, 8> HiOps(NElts);
4363 SmallVector<SDValue, 16> HiAndLoOps;
4364
4365 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4366 for (unsigned I = 0; I != NElts; ++I)
4367 HiOps[I] = HiAndLoOps[2 * I + 1];
4368 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4369 } else {
4370 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4371 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4372 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4373 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4374 }
4375
4376 SDValue NewShift =
4377 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4378
4379 SDValue Vec;
4380 if (VT.isVector()) {
4381 unsigned NElts = TargetType.getVectorNumElements();
4382    SmallVector<SDValue, 8> LoOps;
4383 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4384
4385 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4386 for (unsigned I = 0; I != NElts; ++I)
4387 HiAndLoOps[2 * I] = LoOps[I];
4388 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4389 } else {
4390 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4391 }
4392 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4393}
4394
4395SDValue AMDGPUTargetLowering::performTruncateCombine(
4396 SDNode *N, DAGCombinerInfo &DCI) const {
4397 SDLoc SL(N);
4398 SelectionDAG &DAG = DCI.DAG;
4399 EVT VT = N->getValueType(0);
4400 SDValue Src = N->getOperand(0);
4401
4402 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4403 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4404 SDValue Vec = Src.getOperand(0);
4405 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4406 SDValue Elt0 = Vec.getOperand(0);
4407 EVT EltVT = Elt0.getValueType();
4408 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4409 if (EltVT.isFloatingPoint()) {
4410 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4411 EltVT.changeTypeToInteger(), Elt0);
4412 }
4413
4414 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4415 }
4416 }
4417 }
4418
4419 // Equivalent of above for accessing the high element of a vector as an
4420 // integer operation.
4421 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4422 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4423 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4424 SDValue BV = stripBitcast(Src.getOperand(0));
4425 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4426 EVT SrcEltVT = BV.getOperand(0).getValueType();
4427 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4428 unsigned BitIndex = K->getZExtValue();
4429 unsigned PartIndex = BitIndex / SrcEltSize;
4430
4431 if (PartIndex * SrcEltSize == BitIndex &&
4432 PartIndex < BV.getNumOperands()) {
4433 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4434 SDValue SrcElt =
4435 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4436 BV.getOperand(PartIndex));
4437 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4438 }
4439 }
4440 }
4441 }
4442 }
4443
4444 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4445 //
4446 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4447 // i16 (trunc (srl (i32 (trunc x), K)))
4448 if (VT.getScalarSizeInBits() < 32) {
4449 EVT SrcVT = Src.getValueType();
4450 if (SrcVT.getScalarSizeInBits() > 32 &&
4451 (Src.getOpcode() == ISD::SRL ||
4452 Src.getOpcode() == ISD::SRA ||
4453 Src.getOpcode() == ISD::SHL)) {
4454 SDValue Amt = Src.getOperand(1);
4455 KnownBits Known = DAG.computeKnownBits(Amt);
4456
4457 // - For left shifts, do the transform as long as the shift
4458 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4459 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4460 // losing information stored in the high bits when truncating.
4461 const unsigned MaxCstSize =
4462 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4463 if (Known.getMaxValue().ule(MaxCstSize)) {
4464 EVT MidVT = VT.isVector() ?
4465 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4466 VT.getVectorNumElements()) : MVT::i32;
4467
4468 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4469 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4470 Src.getOperand(0));
4471 DCI.AddToWorklist(Trunc.getNode());
4472
4473 if (Amt.getValueType() != NewShiftVT) {
4474 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4475 DCI.AddToWorklist(Amt.getNode());
4476 }
4477
4478 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4479 Trunc, Amt);
4480 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4481 }
4482 }
4483 }
4484
4485 return SDValue();
4486}
4487
4488// We need to specifically handle i64 mul here to avoid unnecessary conversion
4489// instructions. If we only match on the legalized i64 mul expansion,
4490// SimplifyDemandedBits will be unable to remove them because there will be
4491// multiple uses due to the separate mul + mulh[su].
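// The full product of two 24-bit operands needs at most 48 bits, so for
// results wider than 32 bits the low word comes from MUL_[IU]24 and the high
// word from MULHI_[IU]24, combined with BUILD_PAIR below.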
4492static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4493 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4494 if (Size <= 32) {
4495 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4496 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4497 }
4498
4499 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4500 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4501
4502 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4503 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4504
4505 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4506}
4507
4508/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4509/// return SDValue().
4510static SDValue getAddOneOp(const SDNode *V) {
4511 if (V->getOpcode() != ISD::ADD)
4512 return SDValue();
4513
4514 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4515}
4516
4517SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4518 DAGCombinerInfo &DCI) const {
4519 assert(N->getOpcode() == ISD::MUL);
4520 EVT VT = N->getValueType(0);
4521
4522 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4523 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4524 // unnecessarily). isDivergent() is used as an approximation of whether the
4525 // value is in an SGPR.
4526 if (!N->isDivergent())
4527 return SDValue();
4528
4529 unsigned Size = VT.getSizeInBits();
4530 if (VT.isVector() || Size > 64)
4531 return SDValue();
4532
4533 SelectionDAG &DAG = DCI.DAG;
4534 SDLoc DL(N);
4535
4536 SDValue N0 = N->getOperand(0);
4537 SDValue N1 = N->getOperand(1);
4538
4539  // Undo InstCombine's canonicalization of X * (Y + 1) -> X * Y + X to enable
4540  // mad matching.
4541
4542 // mul x, (add y, 1) -> add (mul x, y), x
4543 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4544 SDValue AddOp = getAddOneOp(V.getNode());
4545 if (!AddOp)
4546 return SDValue();
4547
4548 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4549 return U->getOpcode() == ISD::MUL;
4550 }))
4551 return AddOp;
4552
4553 return SDValue();
4554 };
4555
4556 // FIXME: The selection pattern is not properly checking for commuted
4557 // operands, so we have to place the mul in the LHS
4558 if (SDValue MulOper = IsFoldableAdd(N0)) {
4559 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4560 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4561 }
4562
4563 if (SDValue MulOper = IsFoldableAdd(N1)) {
4564 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4565 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4566 }
4567
4568 // There are i16 integer mul/mad.
4569 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4570 return SDValue();
4571
4572 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4573 // in the source into any_extends if the result of the mul is truncated. Since
4574 // we can assume the high bits are whatever we want, use the underlying value
4575  // to keep the unknown high bits from interfering.
4576 if (N0.getOpcode() == ISD::ANY_EXTEND)
4577 N0 = N0.getOperand(0);
4578
4579 if (N1.getOpcode() == ISD::ANY_EXTEND)
4580 N1 = N1.getOperand(0);
4581
4582 SDValue Mul;
4583
4584 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4585 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4586 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4587 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4588 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4589 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4590 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4591 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4592 } else {
4593 return SDValue();
4594 }
4595
4596 // We need to use sext even for MUL_U24, because MUL_U24 is used
4597 // for signed multiply of 8 and 16-bit types.
4598 return DAG.getSExtOrTrunc(Mul, DL, VT);
4599}
4600
4601SDValue
4602AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4603 DAGCombinerInfo &DCI) const {
4604 if (N->getValueType(0) != MVT::i32)
4605 return SDValue();
4606
4607 SelectionDAG &DAG = DCI.DAG;
4608 SDLoc DL(N);
4609
4610 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4611 SDValue N0 = N->getOperand(0);
4612 SDValue N1 = N->getOperand(1);
4613
4614 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4615 // in the source into any_extends if the result of the mul is truncated. Since
4616 // we can assume the high bits are whatever we want, use the underlying value
4617  // to keep the unknown high bits from interfering.
4618 if (N0.getOpcode() == ISD::ANY_EXTEND)
4619 N0 = N0.getOperand(0);
4620 if (N1.getOpcode() == ISD::ANY_EXTEND)
4621 N1 = N1.getOperand(0);
4622
4623 // Try to use two fast 24-bit multiplies (one for each half of the result)
4624 // instead of one slow extending multiply.
4625 unsigned LoOpcode = 0;
4626 unsigned HiOpcode = 0;
4627 if (Signed) {
4628 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4629 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4630 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4631 LoOpcode = AMDGPUISD::MUL_I24;
4632 HiOpcode = AMDGPUISD::MULHI_I24;
4633 }
4634 } else {
4635 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4636 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4637 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4638 LoOpcode = AMDGPUISD::MUL_U24;
4639 HiOpcode = AMDGPUISD::MULHI_U24;
4640 }
4641 }
4642 if (!LoOpcode)
4643 return SDValue();
4644
4645 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4646 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4647 DCI.CombineTo(N, Lo, Hi);
4648 return SDValue(N, 0);
4649}
4650
4651SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4652 DAGCombinerInfo &DCI) const {
4653 EVT VT = N->getValueType(0);
4654
4655 if (!Subtarget->hasMulI24() || VT.isVector())
4656 return SDValue();
4657
4658 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4659 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4660 // unnecessarily). isDivergent() is used as an approximation of whether the
4661 // value is in an SGPR.
4662 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4663 // valu op anyway)
4664 if (Subtarget->hasSMulHi() && !N->isDivergent())
4665 return SDValue();
4666
4667 SelectionDAG &DAG = DCI.DAG;
4668 SDLoc DL(N);
4669
4670 SDValue N0 = N->getOperand(0);
4671 SDValue N1 = N->getOperand(1);
4672
4673 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4674 return SDValue();
4675
4676 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4677 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4678
4679 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4680 DCI.AddToWorklist(Mulhi.getNode());
4681 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4682}
4683
4684SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4685 DAGCombinerInfo &DCI) const {
4686 EVT VT = N->getValueType(0);
4687
4688 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4689 return SDValue();
4690
4691 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4692 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4693 // unnecessarily). isDivergent() is used as an approximation of whether the
4694 // value is in an SGPR.
4695 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4696 // valu op anyway)
4697 if (Subtarget->hasSMulHi() && !N->isDivergent())
4698 return SDValue();
4699
4700 SelectionDAG &DAG = DCI.DAG;
4701 SDLoc DL(N);
4702
4703 SDValue N0 = N->getOperand(0);
4704 SDValue N1 = N->getOperand(1);
4705
4706 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4707 return SDValue();
4708
4709 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4710 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4711
4712 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4713 DCI.AddToWorklist(Mulhi.getNode());
4714 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4715}
4716
4717SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4718 SDValue Op,
4719 const SDLoc &DL,
4720 unsigned Opc) const {
4721 EVT VT = Op.getValueType();
4722 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4723 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4724 LegalVT != MVT::i16))
4725 return SDValue();
4726
4727 if (VT != MVT::i32)
4728 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4729
4730 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4731 if (VT != MVT::i32)
4732 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4733
4734 return FFBX;
4735}
4736
4737// The native instructions return -1 on 0 input. Optimize out a select that
4738// produces -1 on 0.
4739//
4740// TODO: If zero is not undef, we could also do this if the output is compared
4741// against the bitwidth.
4742//
4743// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4744SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4745 SDValue LHS, SDValue RHS,
4746 DAGCombinerInfo &DCI) const {
4747 if (!isNullConstant(Cond.getOperand(1)))
4748 return SDValue();
4749
4750 SelectionDAG &DAG = DCI.DAG;
4751 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4752 SDValue CmpLHS = Cond.getOperand(0);
4753
4754 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4755 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4756 if (CCOpcode == ISD::SETEQ &&
4757 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4758 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4759 unsigned Opc =
4760 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4761 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4762 }
4763
4764 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4765 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4766 if (CCOpcode == ISD::SETNE &&
4767 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4768 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4769 unsigned Opc =
4770 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4771
4772 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4773 }
4774
4775 return SDValue();
4776}
4777
4778static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4779 unsigned Op,
4780 const SDLoc &SL,
4781 SDValue Cond,
4782 SDValue N1,
4783 SDValue N2) {
4784 SelectionDAG &DAG = DCI.DAG;
4785 EVT VT = N1.getValueType();
4786
4787 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4788 N1.getOperand(0), N2.getOperand(0));
4789 DCI.AddToWorklist(NewSelect.getNode());
4790 return DAG.getNode(Op, SL, VT, NewSelect);
4791}
4792
4793// Pull a free FP operation out of a select so it may fold into uses.
4794//
4795// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4796// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4797//
4798// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4799// select c, (fabs x), +k -> fabs (select c, x, k)
4800SDValue
4801AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4802 SDValue N) const {
4803 SelectionDAG &DAG = DCI.DAG;
4804 SDValue Cond = N.getOperand(0);
4805 SDValue LHS = N.getOperand(1);
4806 SDValue RHS = N.getOperand(2);
4807
4808 EVT VT = N.getValueType();
4809 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4810 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4811    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4812 return SDValue();
4813
4814 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4815 SDLoc(N), Cond, LHS, RHS);
4816 }
4817
4818 bool Inv = false;
4819 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4820 std::swap(LHS, RHS);
4821 Inv = true;
4822 }
4823
4824 // TODO: Support vector constants.
4825  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4826 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4827 !selectSupportsSourceMods(N.getNode())) {
4828 SDLoc SL(N);
4829 // If one side is an fneg/fabs and the other is a constant, we can push the
4830 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4831 SDValue NewLHS = LHS.getOperand(0);
4832 SDValue NewRHS = RHS;
4833
4834 // Careful: if the neg can be folded up, don't try to pull it back down.
4835 bool ShouldFoldNeg = true;
4836
4837 if (NewLHS.hasOneUse()) {
4838 unsigned Opc = NewLHS.getOpcode();
4839 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4840 ShouldFoldNeg = false;
4841 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4842 ShouldFoldNeg = false;
4843 }
4844
4845 if (ShouldFoldNeg) {
4846 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4847 return SDValue();
4848
4849      // We're going to be forced to use a source modifier anyway, so there's no
4850      // point in pulling the negate out unless we can get a size reduction by
4851 // negating the constant.
4852 //
4853 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4854 // about cheaper constants.
4855 if (NewLHS.getOpcode() == ISD::FABS &&
4856          getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4857 return SDValue();
4858
4859      if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4860 return SDValue();
4861
4862 if (LHS.getOpcode() == ISD::FNEG)
4863 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4864
4865 if (Inv)
4866 std::swap(NewLHS, NewRHS);
4867
4868 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4869 Cond, NewLHS, NewRHS);
4870 DCI.AddToWorklist(NewSelect.getNode());
4871 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4872 }
4873 }
4874
4875 return SDValue();
4876}
4877
4878SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4879 DAGCombinerInfo &DCI) const {
4880 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4881 return Folded;
4882
4883 SDValue Cond = N->getOperand(0);
4884 if (Cond.getOpcode() != ISD::SETCC)
4885 return SDValue();
4886
4887 EVT VT = N->getValueType(0);
4888 SDValue LHS = Cond.getOperand(0);
4889 SDValue RHS = Cond.getOperand(1);
4890 SDValue CC = Cond.getOperand(2);
4891
4892 SDValue True = N->getOperand(1);
4893 SDValue False = N->getOperand(2);
4894
4895 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4896 SelectionDAG &DAG = DCI.DAG;
4897 if (DAG.isConstantValueOfAnyType(True) &&
4898 !DAG.isConstantValueOfAnyType(False)) {
4899 // Swap cmp + select pair to move constant to false input.
4900 // This will allow using VOPC cndmasks more often.
4901 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4902
4903 SDLoc SL(N);
4904 ISD::CondCode NewCC =
4905 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4906
4907 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4908 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4909 }
4910
4911 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4912      SDValue MinMax
4913 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4914 // Revisit this node so we can catch min3/max3/med3 patterns.
4915 //DCI.AddToWorklist(MinMax.getNode());
4916 return MinMax;
4917 }
4918 }
4919
4920 // There's no reason to not do this if the condition has other uses.
4921 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4922}
4923
4924static bool isInv2Pi(const APFloat &APF) {
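  // Bit patterns of 1/(2*pi) (~0.15915494) in half, single and double
  // precision.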
4925 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4926 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4927 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4928
4929 return APF.bitwiseIsEqual(KF16) ||
4930 APF.bitwiseIsEqual(KF32) ||
4931 APF.bitwiseIsEqual(KF64);
4932}
4933
4934// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4935// additional cost to negate them.
4936TargetLowering::NegatibleCost
4937AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4938 if (C->isZero())
4939 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4940
4941 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4942 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4943
4944  return NegatibleCost::Neutral;
4945}
4946
4947bool AMDGPUTargetLowering::isConstantCostlierToNegate(SelectionDAG &DAG,
4948                                                      SDValue N) const {
4949  ConstantFPSDNode *C = isConstOrConstSplatFP(N);
4950  return C && (getConstantNegateCost(C) == NegatibleCost::Expensive);
4951}
4952
4953bool AMDGPUTargetLowering::isConstantCheaperToNegate(SelectionDAG &DAG,
4954                                                     SDValue N) const {
4955  ConstantFPSDNode *C = isConstOrConstSplatFP(N);
4956  return C && (getConstantNegateCost(C) == NegatibleCost::Cheaper);
4957}
4958
4959static unsigned inverseMinMax(unsigned Opc) {
4960 switch (Opc) {
4961 case ISD::FMAXNUM:
4962 return ISD::FMINNUM;
4963 case ISD::FMINNUM:
4964 return ISD::FMAXNUM;
4965 case ISD::FMAXNUM_IEEE:
4966 return ISD::FMINNUM_IEEE;
4967 case ISD::FMINNUM_IEEE:
4968 return ISD::FMAXNUM_IEEE;
4969 case ISD::FMAXIMUM:
4970 return ISD::FMINIMUM;
4971 case ISD::FMINIMUM:
4972 return ISD::FMAXIMUM;
4973 case ISD::FMAXIMUMNUM:
4974 return ISD::FMINIMUMNUM;
4975 case ISD::FMINIMUMNUM:
4976 return ISD::FMAXIMUMNUM;
4977 case AMDGPUISD::FMAX_LEGACY:
4978 return AMDGPUISD::FMIN_LEGACY;
4979 case AMDGPUISD::FMIN_LEGACY:
4980 return AMDGPUISD::FMAX_LEGACY;
4981 default:
4982 llvm_unreachable("invalid min/max opcode");
4983 }
4984}
4985
4986/// \return true if it's profitable to try to push an fneg into its source
4987/// instruction.
4988static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4989 // If the input has multiple uses and we can either fold the negate down, or
4990 // the other uses cannot, give up. This both prevents unprofitable
4991 // transformations and infinite loops: we won't repeatedly try to fold around
4992 // a negate that has no 'good' form.
4993 if (N0.hasOneUse()) {
4994 // This may be able to fold into the source, but at a code size cost. Don't
4995 // fold if the fold into the user is free.
4996 if (allUsesHaveSourceMods(N, 0))
4997 return false;
4998 } else {
4999 if (fnegFoldsIntoOp(N0.getNode()) &&
5000 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
5001 return false;
5002 }
5003
5004 return true;
5005}
5006
5007SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
5008 DAGCombinerInfo &DCI) const {
5009 SelectionDAG &DAG = DCI.DAG;
5010 SDValue N0 = N->getOperand(0);
5011 EVT VT = N->getValueType(0);
5012
5013 unsigned Opc = N0.getOpcode();
5014
5015 if (!shouldFoldFNegIntoSrc(N, N0))
5016 return SDValue();
5017
5018 SDLoc SL(N);
5019 switch (Opc) {
5020 case ISD::FADD: {
5021 if (!mayIgnoreSignedZero(N0))
5022 return SDValue();
5023
5024 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5025 SDValue LHS = N0.getOperand(0);
5026 SDValue RHS = N0.getOperand(1);
5027
5028 if (LHS.getOpcode() != ISD::FNEG)
5029 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5030 else
5031 LHS = LHS.getOperand(0);
5032
5033 if (RHS.getOpcode() != ISD::FNEG)
5034 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5035 else
5036 RHS = RHS.getOperand(0);
5037
5038 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5039 if (Res.getOpcode() != ISD::FADD)
5040 return SDValue(); // Op got folded away.
5041 if (!N0.hasOneUse())
5042 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5043 return Res;
5044 }
5045 case ISD::FMUL:
5046 case AMDGPUISD::FMUL_LEGACY: {
5047 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5048 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5049 SDValue LHS = N0.getOperand(0);
5050 SDValue RHS = N0.getOperand(1);
5051
5052 if (LHS.getOpcode() == ISD::FNEG)
5053 LHS = LHS.getOperand(0);
5054 else if (RHS.getOpcode() == ISD::FNEG)
5055 RHS = RHS.getOperand(0);
5056 else
5057 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5058
5059 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5060 if (Res.getOpcode() != Opc)
5061 return SDValue(); // Op got folded away.
5062 if (!N0.hasOneUse())
5063 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5064 return Res;
5065 }
5066 case ISD::FMA:
5067 case ISD::FMAD: {
5068 // TODO: handle llvm.amdgcn.fma.legacy
5069 if (!mayIgnoreSignedZero(N0))
5070 return SDValue();
5071
5072 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5073 SDValue LHS = N0.getOperand(0);
5074 SDValue MHS = N0.getOperand(1);
5075 SDValue RHS = N0.getOperand(2);
5076
5077 if (LHS.getOpcode() == ISD::FNEG)
5078 LHS = LHS.getOperand(0);
5079 else if (MHS.getOpcode() == ISD::FNEG)
5080 MHS = MHS.getOperand(0);
5081 else
5082 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5083
5084 if (RHS.getOpcode() != ISD::FNEG)
5085 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5086 else
5087 RHS = RHS.getOperand(0);
5088
5089 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5090 if (Res.getOpcode() != Opc)
5091 return SDValue(); // Op got folded away.
5092 if (!N0.hasOneUse())
5093 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5094 return Res;
5095 }
5096 case ISD::FMAXNUM:
5097 case ISD::FMINNUM:
5098 case ISD::FMAXNUM_IEEE:
5099 case ISD::FMINNUM_IEEE:
5100 case ISD::FMINIMUM:
5101 case ISD::FMAXIMUM:
5102 case ISD::FMINIMUMNUM:
5103 case ISD::FMAXIMUMNUM:
5104 case AMDGPUISD::FMAX_LEGACY:
5105 case AMDGPUISD::FMIN_LEGACY: {
5106 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5107 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5108 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5109 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5110
5111 SDValue LHS = N0.getOperand(0);
5112 SDValue RHS = N0.getOperand(1);
5113
5114 // 0 doesn't have a negated inline immediate.
5115 // TODO: This constant check should be generalized to other operations.
5116 if (isConstantCostlierToNegate(RHS))
5117 return SDValue();
5118
5119 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5120 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5121 unsigned Opposite = inverseMinMax(Opc);
5122
5123 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5124 if (Res.getOpcode() != Opposite)
5125 return SDValue(); // Op got folded away.
5126 if (!N0.hasOneUse())
5127 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5128 return Res;
5129 }
5130 case AMDGPUISD::FMED3: {
5131 SDValue Ops[3];
5132 for (unsigned I = 0; I < 3; ++I)
5133 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5134
5135 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5136 if (Res.getOpcode() != AMDGPUISD::FMED3)
5137 return SDValue(); // Op got folded away.
5138
5139 if (!N0.hasOneUse()) {
5140 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5141 DAG.ReplaceAllUsesWith(N0, Neg);
5142
5143 for (SDNode *U : Neg->users())
5144 DCI.AddToWorklist(U);
5145 }
5146
5147 return Res;
5148 }
5149 case ISD::FP_EXTEND:
5150 case ISD::FTRUNC:
5151 case ISD::FRINT:
5152 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5153 case ISD::FROUNDEVEN:
5154 case ISD::FSIN:
5155 case ISD::FCANONICALIZE:
5156 case AMDGPUISD::RCP:
5157 case AMDGPUISD::RCP_LEGACY:
5158 case AMDGPUISD::RCP_IFLAG:
5159 case AMDGPUISD::SIN_HW: {
5160 SDValue CvtSrc = N0.getOperand(0);
5161 if (CvtSrc.getOpcode() == ISD::FNEG) {
5162 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5163 // (fneg (rcp (fneg x))) -> (rcp x)
5164 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5165 }
5166
5167 if (!N0.hasOneUse())
5168 return SDValue();
5169
5170 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5171 // (fneg (rcp x)) -> (rcp (fneg x))
5172 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5173 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5174 }
5175 case ISD::FP_ROUND: {
5176 SDValue CvtSrc = N0.getOperand(0);
5177
5178 if (CvtSrc.getOpcode() == ISD::FNEG) {
5179 // (fneg (fp_round (fneg x))) -> (fp_round x)
5180 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5181 CvtSrc.getOperand(0), N0.getOperand(1));
5182 }
5183
5184 if (!N0.hasOneUse())
5185 return SDValue();
5186
5187 // (fneg (fp_round x)) -> (fp_round (fneg x))
5188 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5189 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5190 }
5191 case ISD::FP16_TO_FP: {
5192 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5193 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5194 // Put the fneg back as a legal source operation that can be matched later.
5195 SDLoc SL(N);
5196
5197 SDValue Src = N0.getOperand(0);
5198 EVT SrcVT = Src.getValueType();
5199
5200 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5201 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5202 DAG.getConstant(0x8000, SL, SrcVT));
5203 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5204 }
5205 case ISD::SELECT: {
5206 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5207 // TODO: Invert conditions of foldFreeOpFromSelect
5208 return SDValue();
5209 }
5210 case ISD::BITCAST: {
5211 SDLoc SL(N);
5212 SDValue BCSrc = N0.getOperand(0);
5213 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5214 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5215 if (HighBits.getValueType().getSizeInBits() != 32 ||
5216 !fnegFoldsIntoOp(HighBits.getNode()))
5217 return SDValue();
5218
5219 // f64 fneg only really needs to operate on the high half of the
5220 // register, so try to force it to an f32 operation to help make use of
5221 // source modifiers.
5222 //
5223 //
5224 // fneg (f64 (bitcast (build_vector x, y))) ->
5225 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5226 // (fneg (bitcast i32:y to f32)))
5227
5228 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5229 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5230 SDValue CastBack =
5231 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5232
5233 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5234 Ops.back() = CastBack;
5235 DCI.AddToWorklist(NegHi.getNode());
5236 SDValue Build =
5237 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5238 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5239
5240 if (!N0.hasOneUse())
5241 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5242 return Result;
5243 }
5244
5245 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5246 BCSrc.hasOneUse()) {
5247 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5248 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5249
5250 // TODO: Cast back result for multiple uses is beneficial in some cases.
5251
5252 SDValue LHS =
5253 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5254 SDValue RHS =
5255 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5256
5257 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5258 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5259
5260 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5261 NegRHS);
5262 }
5263
5264 return SDValue();
5265 }
5266 default:
5267 return SDValue();
5268 }
5269}
5270
5271SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5272 DAGCombinerInfo &DCI) const {
5273 SelectionDAG &DAG = DCI.DAG;
5274 SDValue N0 = N->getOperand(0);
5275
5276 if (!N0.hasOneUse())
5277 return SDValue();
5278
5279 switch (N0.getOpcode()) {
5280 case ISD::FP16_TO_FP: {
5281 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5282 SDLoc SL(N);
5283 SDValue Src = N0.getOperand(0);
5284 EVT SrcVT = Src.getValueType();
5285
5286 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5287 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5288 DAG.getConstant(0x7fff, SL, SrcVT));
5289 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5290 }
5291 default:
5292 return SDValue();
5293 }
5294}
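// [Editorial sketch, not part of the original source] Integer forms of the f16
// sign-bit tricks used by the FP16_TO_FP combines above: bit 15 of the
// half-precision encoding is the sign bit.
#include <cstdint>
static uint16_t fnegF16Bits(uint16_t Bits) { return Bits ^ 0x8000; } // fneg
static uint16_t fabsF16Bits(uint16_t Bits) { return Bits & 0x7fff; } // fabs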
5295
5296SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5297 DAGCombinerInfo &DCI) const {
5298 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5299 if (!CFP)
5300 return SDValue();
5301
5302 // XXX - Should this flush denormals?
5303 const APFloat &Val = CFP->getValueAPF();
5304 APFloat One(Val.getSemantics(), "1.0");
5305 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5306}
5307
5308SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5309 DAGCombinerInfo &DCI) const {
5310 SelectionDAG &DAG = DCI.DAG;
5311 SDLoc DL(N);
5312
5313 switch(N->getOpcode()) {
5314 default:
5315 break;
5316 case ISD::BITCAST: {
5317 EVT DestVT = N->getValueType(0);
5318
5319 // Push casts through vector builds. This helps avoid emitting a large
5320 // number of copies when materializing floating point vector constants.
5321 //
5322 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5323 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5324 if (DestVT.isVector()) {
5325 SDValue Src = N->getOperand(0);
5326 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5327 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5328 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5329 EVT SrcVT = Src.getValueType();
5330 unsigned NElts = DestVT.getVectorNumElements();
5331
5332 if (SrcVT.getVectorNumElements() == NElts) {
5333 EVT DestEltVT = DestVT.getVectorElementType();
5334
5335 SmallVector<SDValue, 8> CastedElts;
5336 SDLoc SL(N);
5337 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5338 SDValue Elt = Src.getOperand(I);
5339 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5340 }
5341
5342 return DAG.getBuildVector(DestVT, SL, CastedElts);
5343 }
5344 }
5345 }
5346
5347 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5348 break;
5349
5350 // Fold bitcasts of constants.
5351 //
5352 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5353 // TODO: Generalize and move to DAGCombiner
5354 SDValue Src = N->getOperand(0);
5355 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5356 SDLoc SL(N);
5357 uint64_t CVal = C->getZExtValue();
5358 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5359 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5360 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5361 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5362 }
5363
5364 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5365 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5366 SDLoc SL(N);
5367 uint64_t CVal = Val.getZExtValue();
5368 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5369 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5370 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5371
5372 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5373 }
5374
5375 break;
5376 }
5377 case ISD::SHL:
5378 case ISD::SRA:
5379 case ISD::SRL: {
5380 // Range metadata can be invalidated when loads are converted to legal types
5381 // (e.g. v2i64 -> v4i32).
5382 // Try to convert vector shl/sra/srl before type legalization so that range
5383 // metadata can be utilized.
5384 if (!(N->getValueType(0).isVector() &&
5385 DCI.isBeforeLegalize()) &&
5386 !DCI.isBeforeLegalizeOps())
5387 break;
5388 if (N->getOpcode() == ISD::SHL)
5389 return performShlCombine(N, DCI);
5390 if (N->getOpcode() == ISD::SRA)
5391 return performSraCombine(N, DCI);
5392 return performSrlCombine(N, DCI);
5393 }
5394 case ISD::TRUNCATE:
5395 return performTruncateCombine(N, DCI);
5396 case ISD::MUL:
5397 return performMulCombine(N, DCI);
5398 case AMDGPUISD::MUL_U24:
5399 case AMDGPUISD::MUL_I24: {
5400 if (SDValue Simplified = simplifyMul24(N, DCI))
5401 return Simplified;
5402 break;
5403 }
5404 case AMDGPUISD::MULHI_I24:
5405 case AMDGPUISD::MULHI_U24:
5406 return simplifyMul24(N, DCI);
5407 case ISD::SMUL_LOHI:
5408 case ISD::UMUL_LOHI:
5409 return performMulLoHiCombine(N, DCI);
5410 case ISD::MULHS:
5411 return performMulhsCombine(N, DCI);
5412 case ISD::MULHU:
5413 return performMulhuCombine(N, DCI);
5414 case ISD::SELECT:
5415 return performSelectCombine(N, DCI);
5416 case ISD::FNEG:
5417 return performFNegCombine(N, DCI);
5418 case ISD::FABS:
5419 return performFAbsCombine(N, DCI);
5420 case AMDGPUISD::BFE_I32:
5421 case AMDGPUISD::BFE_U32: {
5422 assert(!N->getValueType(0).isVector() &&
5423 "Vector handling of BFE not implemented");
5424 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5425 if (!Width)
5426 break;
5427
5428 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5429 if (WidthVal == 0)
5430 return DAG.getConstant(0, DL, MVT::i32);
5431
5432 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5433 if (!Offset)
5434 break;
5435
5436 SDValue BitsFrom = N->getOperand(0);
5437 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5438
5439 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5440
5441 if (OffsetVal == 0) {
5442 // This is already sign / zero extended, so try to fold away extra BFEs.
5443 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5444
5445 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5446 if (OpSignBits >= SignBits)
5447 return BitsFrom;
5448
5449 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5450 if (Signed) {
5451 // This is a sign_extend_inreg. Replace it to take advantage of existing
5452 // DAG Combines. If not eliminated, we will match back to BFE during
5453 // selection.
5454
5455 // TODO: The sext_inreg of extended types ends up as multiple operations,
5456 // although we could handle them in a single BFE.
5457 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5458 DAG.getValueType(SmallVT));
5459 }
5460
5461 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5462 }
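    // [Editorial note, not part of the original source] Worked example for the
    // OffsetVal == 0 path above: a signed BFE of width 8 leaves 32 - 8 + 1 = 25
    // copies of the sign bit, so if the source already has at least 25 known
    // sign bits the BFE is redundant and BitsFrom is returned unchanged.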
5463
5464 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5465 if (Signed) {
5466 return constantFoldBFE<int32_t>(DAG,
5467 CVal->getSExtValue(),
5468 OffsetVal,
5469 WidthVal,
5470 DL);
5471 }
5472
5473 return constantFoldBFE<uint32_t>(DAG,
5474 CVal->getZExtValue(),
5475 OffsetVal,
5476 WidthVal,
5477 DL);
5478 }
5479
5480 if ((OffsetVal + WidthVal) >= 32 &&
5481 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5482 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5483 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5484 BitsFrom, ShiftVal);
5485 }
5486
5487 if (BitsFrom.hasOneUse()) {
5488 APInt Demanded = APInt::getBitsSet(32,
5489 OffsetVal,
5490 OffsetVal + WidthVal);
5491
5492 KnownBits Known;
5493 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5494 !DCI.isBeforeLegalizeOps());
5495 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5496 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5497 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5498 DCI.CommitTargetLoweringOpt(TLO);
5499 }
5500 }
5501
5502 break;
5503 }
5504 case ISD::LOAD:
5505 return performLoadCombine(N, DCI);
5506 case ISD::STORE:
5507 return performStoreCombine(N, DCI);
5508 case AMDGPUISD::RCP:
5509 case AMDGPUISD::RCP_IFLAG:
5510 return performRcpCombine(N, DCI);
5511 case ISD::AssertZext:
5512 case ISD::AssertSext:
5513 return performAssertSZExtCombine(N, DCI);
5514 case ISD::INTRINSIC_WO_CHAIN:
5515 return performIntrinsicWOChainCombine(N, DCI);
5516 case AMDGPUISD::FMAD_FTZ: {
5517 SDValue N0 = N->getOperand(0);
5518 SDValue N1 = N->getOperand(1);
5519 SDValue N2 = N->getOperand(2);
5520 EVT VT = N->getValueType(0);
5521
5522 // FMAD_FTZ is a FMAD + flush denormals to zero.
5523 // We flush the inputs, the intermediate step, and the output.
5524 const ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5525 const ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5526 const ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5527 if (N0CFP && N1CFP && N2CFP) {
5528 const auto FTZ = [](const APFloat &V) {
5529 if (V.isDenormal()) {
5530 APFloat Zero(V.getSemantics(), 0);
5531 return V.isNegative() ? -Zero : Zero;
5532 }
5533 return V;
5534 };
5535
5536 APFloat V0 = FTZ(N0CFP->getValueAPF());
5537 APFloat V1 = FTZ(N1CFP->getValueAPF());
5538 APFloat V2 = FTZ(N2CFP->getValueAPF());
5539 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5540 V0 = FTZ(V0);
5541 V0.add(V2, APFloat::rmNearestTiesToEven);
5542 return DAG.getConstantFP(FTZ(V0), DL, VT);
5543 }
5544 break;
5545 }
5546 }
5547 return SDValue();
5548}
5549
5550//===----------------------------------------------------------------------===//
5551// Helper functions
5552//===----------------------------------------------------------------------===//
5553
5554SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5555 const TargetRegisterClass *RC,
5556 Register Reg, EVT VT,
5557 const SDLoc &SL,
5558 bool RawReg) const {
5559 MachineFunction &MF = DAG.getMachineFunction();
5560 MachineRegisterInfo &MRI = MF.getRegInfo();
5561 Register VReg;
5562
5563 if (!MRI.isLiveIn(Reg)) {
5564 VReg = MRI.createVirtualRegister(RC);
5565 MRI.addLiveIn(Reg, VReg);
5566 } else {
5567 VReg = MRI.getLiveInVirtReg(Reg);
5568 }
5569
5570 if (RawReg)
5571 return DAG.getRegister(VReg, VT);
5572
5573 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5574}
5575
5576// This may be called multiple times, and nothing prevents creating multiple
5577// objects at the same offset. See if we already defined this object.
5578static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5579 int64_t Offset) {
5580 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5581 if (MFI.getObjectOffset(I) == Offset) {
5582 assert(MFI.getObjectSize(I) == Size);
5583 return I;
5584 }
5585 }
5586
5587 return MFI.CreateFixedObject(Size, Offset, true);
5588}
5589
5590SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5591 EVT VT,
5592 const SDLoc &SL,
5593 int64_t Offset) const {
5594 MachineFunction &MF = DAG.getMachineFunction();
5595 MachineFrameInfo &MFI = MF.getFrameInfo();
5596 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5597
5598 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5599 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5600
5601 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5602 MachineMemOperand::MODereferenceable |
5603 MachineMemOperand::MOInvariant);
5604}
5605
5606SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5607 const SDLoc &SL,
5608 SDValue Chain,
5609 SDValue ArgVal,
5610 int64_t Offset) const {
5611 MachineFunction &MF = DAG.getMachineFunction();
5612 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5613 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5614
5615 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5616 // Stores to the argument stack area are relative to the stack pointer.
5617 SDValue SP =
5618 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5619 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5620 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5621 MachineMemOperand::MODereferenceable);
5622 return Store;
5623}
5624
5625SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5626 const TargetRegisterClass *RC,
5627 EVT VT, const SDLoc &SL,
5628 const ArgDescriptor &Arg) const {
5629 assert(Arg && "Attempting to load missing argument");
5630
5631 SDValue V = Arg.isRegister() ?
5632 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5633 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5634
5635 if (!Arg.isMasked())
5636 return V;
5637
5638 unsigned Mask = Arg.getMask();
5639 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5640 V = DAG.getNode(ISD::SRL, SL, VT, V,
5641 DAG.getShiftAmountConstant(Shift, VT, SL));
5642 return DAG.getNode(ISD::AND, SL, VT, V,
5643 DAG.getConstant(Mask >> Shift, SL, VT));
5644}
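// [Editorial example, not from the original source] How the mask/shift above
// decodes a packed argument. The field mask below is hypothetical.
#include <cstdint>
static uint32_t decodeMaskedArg(uint32_t V) {
  const uint32_t Mask = 0x3ff00;          // hypothetical 10-bit field at bit 8
  const unsigned Shift = 8;               // countr_zero(Mask)
  return (V >> Shift) & (Mask >> Shift);  // same as (V >> 8) & 0x3ff
}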
5645
5646uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5647 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5648 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5649 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5650 uint64_t ArgOffset =
5651 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5652 switch (Param) {
5653 case FIRST_IMPLICIT:
5654 return ArgOffset;
5655 case PRIVATE_BASE:
5656 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5657 case SHARED_BASE:
5658 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5659 case QUEUE_PTR:
5660 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5661 }
5662 llvm_unreachable("unexpected implicit parameter type");
5663}
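// [Editorial example, not from the original source] With a hypothetical
// ExplicitKernArgSize of 36 and an 8-byte implicit-argument alignment,
// FIRST_IMPLICIT resolves to alignTo(36, 8) + ExplicitArgOffset =
// 40 + ExplicitArgOffset; the other cases add their fixed
// AMDGPU::ImplicitArg::*_OFFSET on top of that base.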
5664
5665uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5666 const MachineFunction &MF, const ImplicitParameter Param) const {
5667 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5668 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5669}
5670
5671SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5672 SelectionDAG &DAG, int Enabled,
5673 int &RefinementSteps,
5674 bool &UseOneConstNR,
5675 bool Reciprocal) const {
5676 EVT VT = Operand.getValueType();
5677
5678 if (VT == MVT::f32) {
5679 RefinementSteps = 0;
5680 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5681 }
5682
5683 // TODO: There is also f64 rsq instruction, but the documentation is less
5684 // clear on its precision.
5685
5686 return SDValue();
5687}
5688
5689SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5690 SelectionDAG &DAG, int Enabled,
5691 int &RefinementSteps) const {
5692 EVT VT = Operand.getValueType();
5693
5694 if (VT == MVT::f32) {
5695 // Reciprocal, < 1 ulp error.
5696 //
5697 // This reciprocal approximation converges to < 0.5 ulp error with one
5698 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5699
5700 RefinementSteps = 0;
5701 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5702 }
5703
5704 // TODO: There is also f64 rcp instruction, but the documentation is less
5705 // clear on its precision.
5706
5707 return SDValue();
5708}
5709
5710static unsigned workitemIntrinsicDim(unsigned ID) {
5711 switch (ID) {
5712 case Intrinsic::amdgcn_workitem_id_x:
5713 return 0;
5714 case Intrinsic::amdgcn_workitem_id_y:
5715 return 1;
5716 case Intrinsic::amdgcn_workitem_id_z:
5717 return 2;
5718 default:
5719 llvm_unreachable("not a workitem intrinsic");
5720 }
5721}
5722
5723void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5724 const SDValue Op, KnownBits &Known,
5725 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5726
5727 Known.resetAll(); // Don't know anything.
5728
5729 unsigned Opc = Op.getOpcode();
5730
5731 switch (Opc) {
5732 default:
5733 break;
5734 case AMDGPUISD::CARRY:
5735 case AMDGPUISD::BORROW: {
5736 Known.Zero = APInt::getHighBitsSet(32, 31);
5737 break;
5738 }
5739
5740 case AMDGPUISD::BFE_I32:
5741 case AMDGPUISD::BFE_U32: {
5742 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5743 if (!CWidth)
5744 return;
5745
5746 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5747
5748 if (Opc == AMDGPUISD::BFE_U32)
5749 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5750
5751 break;
5752 }
5753 case AMDGPUISD::FP_TO_FP16: {
5754 unsigned BitWidth = Known.getBitWidth();
5755
5756 // High bits are zero.
5757 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5758 break;
5759 }
5760 case AMDGPUISD::MUL_U24:
5761 case AMDGPUISD::MUL_I24: {
5762 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5763 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5764 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5765 RHSKnown.countMinTrailingZeros();
5766 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5767 // Skip extra check if all bits are known zeros.
5768 if (TrailZ >= 32)
5769 break;
5770
5771 // Truncate to 24 bits.
5772 LHSKnown = LHSKnown.trunc(24);
5773 RHSKnown = RHSKnown.trunc(24);
5774
5775 if (Opc == AMDGPUISD::MUL_I24) {
5776 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5777 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5778 unsigned MaxValBits = LHSValBits + RHSValBits;
5779 if (MaxValBits > 32)
5780 break;
5781 unsigned SignBits = 32 - MaxValBits + 1;
5782 bool LHSNegative = LHSKnown.isNegative();
5783 bool LHSNonNegative = LHSKnown.isNonNegative();
5784 bool LHSPositive = LHSKnown.isStrictlyPositive();
5785 bool RHSNegative = RHSKnown.isNegative();
5786 bool RHSNonNegative = RHSKnown.isNonNegative();
5787 bool RHSPositive = RHSKnown.isStrictlyPositive();
5788
5789 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5790 Known.Zero.setHighBits(SignBits);
5791 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5792 Known.One.setHighBits(SignBits);
5793 } else {
5794 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5795 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5796 unsigned MaxValBits = LHSValBits + RHSValBits;
5797 if (MaxValBits >= 32)
5798 break;
5799 Known.Zero.setBitsFrom(MaxValBits);
5800 }
5801 break;
5802 }
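  // [Editorial note, not part of the original source] Worked example for the
  // signed case above: if the truncated operands have at most 10 and 12
  // significant bits, MaxValBits is 22, so the top 32 - 22 + 1 = 11 bits of the
  // product are copies of its sign bit; when both operands are known
  // non-negative those 11 high bits are known zero.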
5803 case AMDGPUISD::PERM: {
5804 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5805 if (!CMask)
5806 return;
5807
5808 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5809 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5810 unsigned Sel = CMask->getZExtValue();
5811
5812 for (unsigned I = 0; I < 32; I += 8) {
5813 unsigned SelBits = Sel & 0xff;
5814 if (SelBits < 4) {
5815 SelBits *= 8;
5816 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5817 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5818 } else if (SelBits < 7) {
5819 SelBits = (SelBits & 3) * 8;
5820 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5821 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5822 } else if (SelBits == 0x0c) {
5823 Known.Zero |= 0xFFull << I;
5824 } else if (SelBits > 0x0c) {
5825 Known.One |= 0xFFull << I;
5826 }
5827 Sel >>= 8;
5828 }
5829 break;
5830 }
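  // [Editorial note, not part of the original source] Summary of the PERM
  // selector handling above: each result byte is picked by one selector byte;
  // values 0-3 take a byte of operand 1, 4-6 take a byte of operand 0, 0x0c
  // yields a known-zero byte, values above 0x0c a known-one (0xff) byte, and
  // any other selector leaves that byte unknown.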
5831 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5832 Known.Zero.setHighBits(24);
5833 break;
5834 }
5835 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5836 Known.Zero.setHighBits(16);
5837 break;
5838 }
5839 case AMDGPUISD::LDS: {
5840 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5841 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5842
5843 Known.Zero.setHighBits(16);
5844 Known.Zero.setLowBits(Log2(Alignment));
5845 break;
5846 }
5847 case AMDGPUISD::SMIN3:
5848 case AMDGPUISD::SMAX3:
5849 case AMDGPUISD::SMED3:
5850 case AMDGPUISD::UMIN3:
5851 case AMDGPUISD::UMAX3:
5852 case AMDGPUISD::UMED3: {
5853 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5854 if (Known2.isUnknown())
5855 break;
5856
5857 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5858 if (Known1.isUnknown())
5859 break;
5860
5861 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5862 if (Known0.isUnknown())
5863 break;
5864
5865 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5866 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5867 Known.One = Known0.One & Known1.One & Known2.One;
5868 break;
5869 }
5870 case ISD::INTRINSIC_WO_CHAIN: {
5871 unsigned IID = Op.getConstantOperandVal(0);
5872 switch (IID) {
5873 case Intrinsic::amdgcn_workitem_id_x:
5874 case Intrinsic::amdgcn_workitem_id_y:
5875 case Intrinsic::amdgcn_workitem_id_z: {
5876 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5877 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5878 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5879 break;
5880 }
5881 default:
5882 break;
5883 }
5884 }
5885 }
5886}
5887
5888unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5889 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5890 unsigned Depth) const {
5891 switch (Op.getOpcode()) {
5892 case AMDGPUISD::BFE_I32: {
5893 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5894 if (!Width)
5895 return 1;
5896
5897 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5898 if (!isNullConstant(Op.getOperand(1)))
5899 return SignBits;
5900
5901 // TODO: Could probably figure something out with non-0 offsets.
5902 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5903 return std::max(SignBits, Op0SignBits);
5904 }
5905
5906 case AMDGPUISD::BFE_U32: {
5907 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5908 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5909 }
5910
5911 case AMDGPUISD::CARRY:
5912 case AMDGPUISD::BORROW:
5913 return 31;
5914 case AMDGPUISD::BUFFER_LOAD_BYTE:
5915 return 25;
5916 case AMDGPUISD::BUFFER_LOAD_SHORT:
5917 return 17;
5918 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5919 return 24;
5920 case AMDGPUISD::BUFFER_LOAD_USHORT:
5921 return 16;
5922 case AMDGPUISD::FP_TO_FP16:
5923 return 16;
5924 case AMDGPUISD::SMIN3:
5925 case AMDGPUISD::SMAX3:
5926 case AMDGPUISD::SMED3:
5927 case AMDGPUISD::UMIN3:
5928 case AMDGPUISD::UMAX3:
5929 case AMDGPUISD::UMED3: {
5930 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5931 if (Tmp2 == 1)
5932 return 1; // Early out.
5933
5934 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5935 if (Tmp1 == 1)
5936 return 1; // Early out.
5937
5938 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5939 if (Tmp0 == 1)
5940 return 1; // Early out.
5941
5942 return std::min({Tmp0, Tmp1, Tmp2});
5943 }
5944 default:
5945 return 1;
5946 }
5947}
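// [Editorial note, not part of the original source] The fixed returns above
// follow the usual rule for extending loads: a sign-extended byte load has
// 32 - 8 + 1 = 25 sign bits and a sign-extended short load 17; the
// zero-extended forms give 24 and 16 known-zero (and therefore sign) bits,
// and FP_TO_FP16 clears the top 16 bits.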
5948
5949unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5950 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
5951 const MachineRegisterInfo &MRI, unsigned Depth) const {
5952 const MachineInstr *MI = MRI.getVRegDef(R);
5953 if (!MI)
5954 return 1;
5955
5956 // TODO: Check range metadata on MMO.
5957 switch (MI->getOpcode()) {
5958 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5959 return 25;
5960 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5961 return 17;
5962 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5963 return 24;
5964 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5965 return 16;
5966 case AMDGPU::G_AMDGPU_SMED3:
5967 case AMDGPU::G_AMDGPU_UMED3: {
5968 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5969 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5970 if (Tmp2 == 1)
5971 return 1;
5972 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5973 if (Tmp1 == 1)
5974 return 1;
5975 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5976 if (Tmp0 == 1)
5977 return 1;
5978 return std::min({Tmp0, Tmp1, Tmp2});
5979 }
5980 default:
5981 return 1;
5982 }
5983}
5984
5985bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
5986 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5987 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
5988 unsigned Opcode = Op.getOpcode();
5989 switch (Opcode) {
5990 case AMDGPUISD::BFE_I32:
5991 case AMDGPUISD::BFE_U32:
5992 return false;
5993 }
5994 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
5995 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
5996}
5997
5998bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
5999 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6000 unsigned Depth) const {
6001 unsigned Opcode = Op.getOpcode();
6002 switch (Opcode) {
6003 case AMDGPUISD::FMIN_LEGACY:
6004 case AMDGPUISD::FMAX_LEGACY: {
6005 if (SNaN)
6006 return true;
6007
6008 // TODO: Can check no nans on one of the operands for each one, but which
6009 // one?
6010 return false;
6011 }
6012 case AMDGPUISD::FMUL_LEGACY:
6013 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6014 if (SNaN)
6015 return true;
6016 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6017 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6018 }
6019 case AMDGPUISD::FMED3:
6020 case AMDGPUISD::FMIN3:
6021 case AMDGPUISD::FMAX3:
6022 case AMDGPUISD::FMINIMUM3:
6023 case AMDGPUISD::FMAXIMUM3:
6024 case AMDGPUISD::FMAD_FTZ: {
6025 if (SNaN)
6026 return true;
6027 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6028 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6029 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6030 }
6031 case AMDGPUISD::CVT_F32_UBYTE0:
6032 case AMDGPUISD::CVT_F32_UBYTE1:
6033 case AMDGPUISD::CVT_F32_UBYTE2:
6034 case AMDGPUISD::CVT_F32_UBYTE3:
6035 return true;
6036
6037 case AMDGPUISD::RCP:
6038 case AMDGPUISD::RSQ:
6039 case AMDGPUISD::RCP_LEGACY:
6040 case AMDGPUISD::RSQ_CLAMP: {
6041 if (SNaN)
6042 return true;
6043
6044 // TODO: Need is known positive check.
6045 return false;
6046 }
6047 case ISD::FLDEXP:
6048 case AMDGPUISD::FRACT: {
6049 if (SNaN)
6050 return true;
6051 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6052 }
6053 case AMDGPUISD::DIV_SCALE:
6054 case AMDGPUISD::DIV_FMAS:
6055 case AMDGPUISD::DIV_FIXUP:
6056 // TODO: Refine on operands.
6057 return SNaN;
6058 case AMDGPUISD::SIN_HW:
6059 case AMDGPUISD::COS_HW: {
6060 // TODO: Need check for infinity
6061 return SNaN;
6062 }
6063 case ISD::INTRINSIC_WO_CHAIN: {
6064 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6065 // TODO: Handle more intrinsics
6066 switch (IntrinsicID) {
6067 case Intrinsic::amdgcn_cubeid:
6068 case Intrinsic::amdgcn_cvt_off_f32_i4:
6069 return true;
6070
6071 case Intrinsic::amdgcn_frexp_mant: {
6072 if (SNaN)
6073 return true;
6074 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6075 }
6076 case Intrinsic::amdgcn_cvt_pkrtz: {
6077 if (SNaN)
6078 return true;
6079 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6080 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6081 }
6082 case Intrinsic::amdgcn_rcp:
6083 case Intrinsic::amdgcn_rsq:
6084 case Intrinsic::amdgcn_rcp_legacy:
6085 case Intrinsic::amdgcn_rsq_legacy:
6086 case Intrinsic::amdgcn_rsq_clamp:
6087 case Intrinsic::amdgcn_tanh: {
6088 if (SNaN)
6089 return true;
6090
6091 // TODO: Need is known positive check.
6092 return false;
6093 }
6094 case Intrinsic::amdgcn_trig_preop:
6095 case Intrinsic::amdgcn_fdot2:
6096 // TODO: Refine on operand
6097 return SNaN;
6098 case Intrinsic::amdgcn_fma_legacy:
6099 if (SNaN)
6100 return true;
6101 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6102 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6103 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6104 default:
6105 return false;
6106 }
6107 }
6108 default:
6109 return false;
6110 }
6111}
6112
6113bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6114 Register N0, Register N1) const {
6115 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6116}
unsigned const MachineRegisterInfo * MRI
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_READONLY
Definition Compiler.h:322
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static const AMDGPUSubtarget & get(const MachineFunction &MF)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1396
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1163
const fltSemantics & getSemantics() const
Definition APFloat.h:1439
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1181
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1140
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1151
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1389
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
Type * getValueType() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
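For illustration, a target combine typically walks a node's operands with these accessors (N and the surrounding combine are assumed, not taken from this file):
  // Sketch: inspect a shift node inside a DAG combine.
  if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
    SDValue Src = N->getOperand(0);
    // getConstantOperandVal assumes operand 1 is a ConstantSDNode.
    uint64_t ShAmt = N->getConstantOperandVal(1);
  }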
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which must always have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
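For example, a minimal sketch (assuming DAG, SL and Op are in scope) of building a node with these factory methods:
  // Sketch: materialize Op & 0xFFFF as new SelectionDAG nodes.
  SDValue Mask = DAG.getConstant(0xFFFF, SL, MVT::i32);
  SDValue Lo16 = DAG.getNode(ISD::AND, SL, MVT::i32, Op, Mask);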
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
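A hedged sketch of how a custom load-lowering hook might fall back to this helper (variable names assumed; this is not the lowering in this file):
  // Sketch: legalize a misaligned load by splitting it.
  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  if (!allowsMisalignedMemoryAccesses(LD->getMemoryVT(), LD->getAddressSpace(),
                                      LD->getAlign(),
                                      LD->getMemOperand()->getFlags())) {
    auto [Value, Chain] = expandUnalignedLoad(LD, DAG);
    return DAG.getMergeValues({Value, Chain}, SDLoc(LD));
  }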
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op: knowing that only the DemandedBits bits of its result are used, try to simplify or narrow Op.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetOptions Options
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:534
@ IS_FPCLASS
Performs a check of a floating-point class property, as defined by IEEE-754.
Definition ISDOpcodes.h:541
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
MaybeAlign getAlign(const CallInst &I, unsigned Index)
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:202
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:236
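Small worked examples of these bit-scan helpers (values chosen only for illustration):
  unsigned TZ = countr_zero(0x00000008u); // 3: three zeros below the set bit
  unsigned LZ = countl_zero(0x00000008u); // 28: bit 3 is the highest set bit of 32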
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
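For example, splitting a 64-bit constant into the halves these helpers return:
  uint64_t Imm = 0x123456789ABCDEF0ULL;
  uint32_t Hi = Hi_32(Imm); // 0x12345678
  uint32_t Lo = Lo_32(Imm); // 0x9ABCDEF0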
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns the smallest multiple of A that is greater than or equal to Size, i.e. Size rounded up to the alignment.
Definition Alignment.h:144
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the largest alignment satisfied by both A and the given Offset.
Definition Alignment.h:201
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1551
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
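Worked examples of the alignment helpers listed above (values are illustrative only):
  uint64_t Padded = alignTo(10, Align(8));          // 16
  unsigned Shift  = Log2(Align(16));                // 4
  Align Common    = commonAlignment(Align(16), 4);  // Align(4)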
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:477
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:419
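A sketch of the integer-EVT helpers above, assuming an LLVMContext &Ctx is available:
  EVT I24   = EVT::getIntegerVT(Ctx, 24);           // extended (non-simple) 24-bit type
  EVT I32   = I24.getRoundIntegerType(Ctx);         // 24 bits rounds up to i32
  EVT V4I32 = EVT::getVectorVT(Ctx, MVT::i32, 4);   // 4 x i32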
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:292
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:142
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:129
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition KnownBits.h:114
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition KnownBits.h:269
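A sketch of querying a KnownBits result (Known is assumed to come from a computeKnownBits call, not from this file):
  if (Known.isNonNegative() && Known.countMaxActiveBits() <= 16) {
    // value provably fits in 16 unsigned bits
  }
  APInt UpperBound = Known.getMaxValue();
  unsigned LeadZ = Known.countMinLeadingZeros();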
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...