LLVM 23.0.0git
AMDGPUISelLowering.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
39
40// Find a larger type to do a load / store of a vector with.
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
51
55
57 // In order for this to be a signed 24-bit value, bit 23 must
58 // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
61
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather than generating calls to memset, memcpy or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
172 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2f32, MVT::i64);
173
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
176
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
179
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
182
184 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
185
187 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2f32, MVT::i64);
188
189 // There are no 64-bit extloads. These should be done as a 32-bit extload and
190 // an extension to 64-bit.
191 for (MVT VT : MVT::integer_valuetypes())
193 Expand);
194
195 for (MVT VT : MVT::integer_valuetypes()) {
196 if (VT == MVT::i64)
197 continue;
198
199 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
200 setLoadExtAction(Op, VT, MVT::i1, Promote);
201 setLoadExtAction(Op, VT, MVT::i8, Legal);
202 setLoadExtAction(Op, VT, MVT::i16, Legal);
203 setLoadExtAction(Op, VT, MVT::i32, Expand);
204 }
205 }
206
208 for (auto MemVT :
209 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
211 Expand);
212
213 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
227
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
234
235 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
241 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
242 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
243 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
244 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
245 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
246 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
247
249 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
250
252 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
253
255 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
256
258 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
259
261 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
262
264 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
265
267 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
268
270 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
271
273 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
274
276 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
277
279 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
280
282 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
283
285 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
286
288 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
289
291 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
292
294 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
295
297 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
298
300 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
301
303 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
304
306 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
307
309 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
310
312 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
313
315 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
316
318 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
319
321 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
322
324 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
325
327 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
328
329 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
330 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
331 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
332 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
333
334 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
335 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
336 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
337 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
338
339 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
340 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
341 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
342 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
343 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
344 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
345 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
346 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
347 setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
348 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
349 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
350 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
351 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
352 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
353 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
354
355 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
356 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
357 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
358
359 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
360 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
361 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
362
363 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
364
365 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
366 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
367 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
368 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
369 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
370 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
371 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
372
373 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
374 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
375 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
376 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
377 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
378
379 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
380 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
381 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
382
383 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
384 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
385 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
386
387 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
388 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
389 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
390
391 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
392 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
393 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
394
395 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
396 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
397 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
398 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
399 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
400 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
401 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
402
403 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
404 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
405
407
408 // For R600, this is totally unsupported, just custom lower to produce an
409 // error.
411
412 // Library functions. These default to Expand, but we have instructions
413 // for them.
416 {MVT::f16, MVT::f32}, Legal);
418
420 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
422 {MVT::f16, MVT::f32, MVT::f64}, Expand);
423
426 Custom);
428
429 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
430
431 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
432
433 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
434 Expand);
435
436 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
437 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
439
441 Custom);
442
443 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
444
445 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
446 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
447 // default unless marked custom/legal.
449 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
450 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
451 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
452 MVT::v16f64},
453 Custom);
454
455 // Expand to fneg + fadd.
457
459 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
460 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
461 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
462 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
463 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
464 Custom);
465
468 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
469 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
470 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
471 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
472 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
473 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
474 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
475 Custom);
476
478 Expand);
479 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
480
481 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
482 for (MVT VT : ScalarIntVTs) {
483 // These should use [SU]DIVREM, so set them to expand
485 Expand);
486
487 // GPU does not have divrem function for signed or unsigned.
489
490 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
492
494
496 Expand);
497 }
498
499 // The hardware supports 32-bit FSHR, but not FSHL.
501
502 setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
503
505
510 MVT::i64, Custom);
512
514 Legal);
515
518 MVT::i64, Custom);
519
520 for (auto VT : {MVT::i8, MVT::i16})
522
523 static const MVT::SimpleValueType VectorIntTypes[] = {
524 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
525 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
526
527 for (MVT VT : VectorIntTypes) {
528 // Expand the following operations for the current type by default.
529 // clang-format off
549 VT, Expand);
550 // clang-format on
551 }
552
553 static const MVT::SimpleValueType FloatVectorTypes[] = {
554 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
555 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
556
557 for (MVT VT : FloatVectorTypes) {
570 VT, Expand);
571 }
572
573 // This causes using an unrolled select operation rather than expansion with
574 // bit operations. This is in general better, but the alternative using BFI
575 // instructions may be better if the select sources are SGPRs.
577 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
578
580 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
581
583 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
584
586 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
587
589 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
590
592 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
593
595 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
596
598 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
599
601 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
602
604 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
605
607 setJumpIsExpensive(true);
608
611
613
614 // We want to find all load dependencies for long chains of stores to enable
615 // merging into very wide vectors. The problem is with vectors with > 4
616 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
617 // vectors are a legal type, even though we have to split the loads
618 // usually. When we can more precisely specify load legality per address
619 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
620 // smarter so that they can figure out what to do in 2 iterations without all
621 // N > 4 stores on the same chain.
623
624 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
625 // about these during lowering.
626 MaxStoresPerMemcpy = 0xffffffff;
627 MaxStoresPerMemmove = 0xffffffff;
628 MaxStoresPerMemset = 0xffffffff;
629
630 // The expansion for 64-bit division is enormous.
632 addBypassSlowDiv(64, 32);
633
644
648}
649
651 const auto Flags = Op.getNode()->getFlags();
652 if (Flags.hasNoSignedZeros())
653 return true;
654
655 return false;
656}
657
658//===----------------------------------------------------------------------===//
659// Target Information
660//===----------------------------------------------------------------------===//
661
663static bool fnegFoldsIntoOpcode(unsigned Opc) {
664 switch (Opc) {
665 case ISD::FADD:
666 case ISD::FSUB:
667 case ISD::FMUL:
668 case ISD::FMA:
669 case ISD::FMAD:
670 case ISD::FMINNUM:
671 case ISD::FMAXNUM:
674 case ISD::FMINIMUM:
675 case ISD::FMAXIMUM:
676 case ISD::FMINIMUMNUM:
677 case ISD::FMAXIMUMNUM:
678 case ISD::SELECT:
679 case ISD::FSIN:
680 case ISD::FTRUNC:
681 case ISD::FRINT:
682 case ISD::FNEARBYINT:
683 case ISD::FROUNDEVEN:
685 case AMDGPUISD::RCP:
686 case AMDGPUISD::RCP_LEGACY:
687 case AMDGPUISD::RCP_IFLAG:
688 case AMDGPUISD::SIN_HW:
689 case AMDGPUISD::FMUL_LEGACY:
690 case AMDGPUISD::FMIN_LEGACY:
691 case AMDGPUISD::FMAX_LEGACY:
692 case AMDGPUISD::FMED3:
693 // TODO: handle llvm.amdgcn.fma.legacy
694 return true;
695 case ISD::BITCAST:
696 llvm_unreachable("bitcast is special cased");
697 default:
698 return false;
699 }
700}
701
702static bool fnegFoldsIntoOp(const SDNode *N) {
703 unsigned Opc = N->getOpcode();
704 if (Opc == ISD::BITCAST) {
705 // TODO: Is there a benefit to checking the conditions performFNegCombine
706 // does? We don't for the other cases.
707 SDValue BCSrc = N->getOperand(0);
708 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
709 return BCSrc.getNumOperands() == 2 &&
710 BCSrc.getOperand(1).getValueSizeInBits() == 32;
711 }
712
713 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
714 }
715
716 return fnegFoldsIntoOpcode(Opc);
717}
718
719/// \p returns true if the operation will definitely need to use a 64-bit
720/// encoding, and thus will use a VOP3 encoding regardless of the source
721/// modifiers.
723static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
724 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
725 VT == MVT::f64;
726}
727
728/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
729/// type for ISD::SELECT.
731static bool selectSupportsSourceMods(const SDNode *N) {
732 // TODO: Only applies if select will be vector
733 return N->getValueType(0) == MVT::f32;
734}
735
736// Most FP instructions support source modifiers, but this could be refined
737// slightly.
//
// Reports whether node N is treated as able to take source modifiers. Memory
// nodes and the opcodes listed ahead of a `return false` below are rejected;
// any opcode not listed is accepted by the final default case.
//
// NOTE(review): the embedded line numbers in this listing skip 738, 748, 750,
// 757 and 770, so some lines of this function are not visible here (see the
// notes inside the body). Confirm against the upstream file before editing.
739static bool hasSourceMods(const SDNode *N) {
// Memory-accessing nodes are never considered to have source modifiers.
740 if (isa<MemSDNode>(N))
741 return false;
742
743 switch (N->getOpcode()) {
// Opcodes that are rejected outright.
744 case ISD::CopyToReg:
745 case ISD::FDIV:
746 case ISD::FREM:
747 case ISD::INLINEASM:
749 case AMDGPUISD::DIV_SCALE:
751
752 // TODO: Should really be looking at the users of the bitcast. These are
753 // problematic because bitcasts are used to legalize all stores to integer
754 // types.
755 case ISD::BITCAST:
756 return false;
// NOTE(review): the case label that opens the brace closed further down is
// missing from this view (numbering skips 757). The inner switch compares
// constant operand 0 against intrinsic IDs: the interp intrinsics listed are
// rejected and every other ID is accepted.
758 switch (N->getConstantOperandVal(0)) {
759 case Intrinsic::amdgcn_interp_p1:
760 case Intrinsic::amdgcn_interp_p2:
761 case Intrinsic::amdgcn_interp_mov:
762 case Intrinsic::amdgcn_interp_p1_f16:
763 case Intrinsic::amdgcn_interp_p2_f16:
764 return false;
765 default:
766 return true;
767 }
768 }
// NOTE(review): numbering skips 770 here, so the statement belonging to the
// SELECT case is presumably missing from this view -- TODO confirm.
769 case ISD::SELECT:
771 default:
772 return true;
773 }
774}
775
777 unsigned CostThreshold) {
778 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
779 // it is truly free to use a source modifier in all cases. If there are
780 // multiple users but for each one will necessitate using VOP3, there will be
781 // a code size increase. Try to avoid increasing code size unless we know it
782 // will save on the instruction count.
783 unsigned NumMayIncreaseSize = 0;
784 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
785
786 assert(!N->use_empty());
787
788 // XXX - Should this limit number of uses to check?
789 for (const SDNode *U : N->users()) {
790 if (!hasSourceMods(U))
791 return false;
792
793 if (!opMustUseVOP3Encoding(U, VT)) {
794 if (++NumMayIncreaseSize > CostThreshold)
795 return false;
796 }
797 }
798
799 return true;
800}
801
803 ISD::NodeType ExtendKind) const {
804 assert(!VT.isVector() && "only scalar expected");
805
806 // Round to the next multiple of 32-bits.
807 unsigned Size = VT.getSizeInBits();
808 if (Size <= 32)
809 return MVT::i32;
810 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
811}
812
814 return 32;
815}
816
818 return true;
819}
820
821// The backend supports 32 and 64 bit floating point immediates.
822// FIXME: Why are we reporting vectors of FP immediates as legal?
824 bool ForCodeSize) const {
825 return isTypeLegal(VT.getScalarType());
826}
827
828// We don't want to shrink f64 / f32 constants.
830 EVT ScalarVT = VT.getScalarType();
831 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
832}
833
835 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
836 std::optional<unsigned> ByteOffset) const {
837 // TODO: This may be worth removing. Check regression tests for diffs.
838 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
839 return false;
840
841 unsigned NewSize = NewVT.getStoreSizeInBits();
842
843 // If we are reducing to a 32-bit load or a smaller multi-dword load,
844 // this is always better.
845 if (NewSize >= 32)
846 return true;
847
848 EVT OldVT = N->getValueType(0);
849 unsigned OldSize = OldVT.getStoreSizeInBits();
850
852 unsigned AS = MN->getAddressSpace();
853 // Do not shrink an aligned scalar load to sub-dword.
854 // Scalar engine cannot do sub-dword loads.
855 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
856 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
860 MN->isInvariant())) &&
862 return false;
863
864 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
865 // extloads, so doing one requires using a buffer_load. In cases where we
866 // still couldn't use a scalar load, using the wider load shouldn't really
867 // hurt anything.
868
869 // If the old size already had to be an extload, there's no harm in continuing
870 // to reduce the width.
871 return (OldSize < 32);
872}
873
875 const SelectionDAG &DAG,
876 const MachineMemOperand &MMO) const {
877
878 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
879
880 if (LoadTy.getScalarType() == MVT::i32)
881 return false;
882
883 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
884 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
885
886 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
887 return false;
888
889 unsigned Fast = 0;
891 CastTy, MMO, &Fast) &&
892 Fast;
893}
894
895// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
896// profitable with the expansion for 64-bit since it's generally good to
897// speculate things.
899 return true;
900}
901
903 return true;
904}
905
907 switch (N->getOpcode()) {
908 case ISD::EntryToken:
909 case ISD::TokenFactor:
910 return true;
912 unsigned IntrID = N->getConstantOperandVal(0);
914 }
916 unsigned IntrID = N->getConstantOperandVal(1);
918 }
919 case ISD::LOAD:
920 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
922 return true;
923 return false;
924 case AMDGPUISD::SETCC: // ballot-style instruction
925 return true;
926 }
927 return false;
928}
929
931 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
932 NegatibleCost &Cost, unsigned Depth) const {
933
934 switch (Op.getOpcode()) {
935 case ISD::FMA:
936 case ISD::FMAD: {
937 // Negating a fma is not free if it has users without source mods.
938 if (!allUsesHaveSourceMods(Op.getNode()))
939 return SDValue();
940 break;
941 }
942 case AMDGPUISD::RCP: {
943 SDValue Src = Op.getOperand(0);
944 EVT VT = Op.getValueType();
945 SDLoc SL(Op);
946
947 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
948 ForCodeSize, Cost, Depth + 1);
949 if (NegSrc)
950 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
951 return SDValue();
952 }
953 default:
954 break;
955 }
956
957 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
958 ForCodeSize, Cost, Depth);
959}
960
961//===---------------------------------------------------------------------===//
962// Target Properties
963//===---------------------------------------------------------------------===//
964
967
968 // Packed operations do not have a fabs modifier.
969 // Report this based on the end legalized type.
970 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
971}
972
975 // Report this based on the end legalized type.
976 VT = VT.getScalarType();
977 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
978}
979
981 unsigned NumElem,
982 unsigned AS) const {
983 return true;
984}
985
987 // There are few operations which truly have vector input operands. Any vector
988 // operation is going to involve operations on each component, and a
989 // build_vector will be a copy per element, so it always makes sense to use a
990 // build_vector input in place of the extracted element to avoid a copy into a
991 // super register.
992 //
993 // We should probably only do this if all users are extracts only, but this
994 // should be the common case.
995 return true;
996}
997
999 // Truncate is just accessing a subregister.
1000
1001 unsigned SrcSize = Source.getSizeInBits();
1002 unsigned DestSize = Dest.getSizeInBits();
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0 ;
1005}
1006
1008 // Truncate is just accessing a subregister.
1009
1010 unsigned SrcSize = Source->getScalarSizeInBits();
1011 unsigned DestSize = Dest->getScalarSizeInBits();
1012
1013 if (DestSize== 16 && Subtarget->has16BitInsts())
1014 return SrcSize >= 32;
1015
1016 return DestSize < SrcSize && DestSize % 32 == 0;
1017}
1018
1020 unsigned SrcSize = Src->getScalarSizeInBits();
1021 unsigned DestSize = Dest->getScalarSizeInBits();
1022
1023 if (SrcSize == 16 && Subtarget->has16BitInsts())
1024 return DestSize >= 32;
1025
1026 return SrcSize == 32 && DestSize == 64;
1027}
1028
1030 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1031 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1032 // this will enable reducing 64-bit operations to 32-bit, which is always
1033 // good.
1034
1035 if (Src == MVT::i16)
1036 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1037
1038 return Src == MVT::i32 && Dest == MVT::i64;
1039}
1040
1042 EVT DestVT) const {
1043 switch (N->getOpcode()) {
1044 case ISD::ABS:
1045 case ISD::ADD:
1046 case ISD::SUB:
1047 case ISD::SHL:
1048 case ISD::SRL:
1049 case ISD::SRA:
1050 case ISD::AND:
1051 case ISD::OR:
1052 case ISD::XOR:
1053 case ISD::MUL:
1054 case ISD::SETCC:
1055 case ISD::SELECT:
1056 case ISD::SMIN:
1057 case ISD::SMAX:
1058 case ISD::UMIN:
1059 case ISD::UMAX:
1060 case ISD::USUBSAT:
1061 if (isTypeLegal(MVT::i16) &&
1062 (!DestVT.isVector() ||
1063 !isOperationLegal(ISD::ADD, MVT::v2i16))) { // Check if VOP3P
1064 // Don't narrow back down to i16 if promoted to i32 already.
1065 if (!N->isDivergent() && DestVT.isInteger() &&
1066 DestVT.getScalarSizeInBits() > 1 &&
1067 DestVT.getScalarSizeInBits() <= 16 &&
1068 SrcVT.getScalarSizeInBits() > 16) {
1069 return false;
1070 }
1071 }
1072 return true;
1073 default:
1074 break;
1075 }
1076
1077 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1078 // limited number of native 64-bit operations. Shrinking an operation to fit
1079 // in a single 32-bit register should always be helpful. As currently used,
1080 // this is much less general than the name suggests, and is only used in
1081 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1082 // not profitable, and may actually be harmful.
1083 if (isa<LoadSDNode>(N))
1084 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1085
1086 return true;
1087}
1088
1090 const SDNode* N, CombineLevel Level) const {
1091 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1092 N->getOpcode() == ISD::SRL) &&
1093 "Expected shift op");
1094
1095 SDValue ShiftLHS = N->getOperand(0);
1096 if (!ShiftLHS->hasOneUse())
1097 return false;
1098
1099 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1100 !ShiftLHS.getOperand(0)->hasOneUse())
1101 return false;
1102
1103 // Always commute pre-type legalization and right shifts.
1104 // We're looking for shl(or(x,y),z) patterns.
1106 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1107 return true;
1108
1109 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1110 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1111 (N->user_begin()->getOpcode() == ISD::SRA ||
1112 N->user_begin()->getOpcode() == ISD::SRL))
1113 return false;
1114
1115 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1116 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1117 if (LHS.getOpcode() != ISD::SHL)
1118 return false;
1119 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1120 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1121 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1122 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1123 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1124 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1125 };
1126 SDValue LHS = N->getOperand(0).getOperand(0);
1127 SDValue RHS = N->getOperand(0).getOperand(1);
1128 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1129}
1130
1131//===---------------------------------------------------------------------===//
1132// TargetLowering Callbacks
1133//===---------------------------------------------------------------------===//
1134
// Maps a calling convention to the CCAssignFn used when lowering a call.
// C, Fast and Cold share CC_AMDGPU_Func; a convention that matches no case
// reaches the default and raises a fatal usage error.
// NOTE(review): the signature line (1135) and several `case` labels are absent
// from this listing, so the conventions that select CC_AMDGPU,
// CC_AMDGPU_CS_CHAIN and CC_SI_Gfx cannot be read here. IsVarArg is not
// referenced in the visible lines.
1136 bool IsVarArg) {
1137 switch (CC) {
1145 return CC_AMDGPU;
1148 return CC_AMDGPU_CS_CHAIN;
1149 case CallingConv::C:
1150 case CallingConv::Fast:
1151 case CallingConv::Cold:
1152 return CC_AMDGPU_Func;
1155 return CC_SI_Gfx;
1158 default:
1159 reportFatalUsageError("unsupported calling convention for call");
1160 }
1161}
1162
// Maps a calling convention to the CCAssignFn used for return values.
// C, Fast and Cold share RetCC_AMDGPU_Func; a convention that matches no case
// raises a fatal usage error. The first visible branch is marked unreachable
// with the message "kernels should not be handled here".
// NOTE(review): the signature line (1163) and most `case` labels are absent
// from this listing, so which conventions select RetCC_SI_Shader and
// RetCC_SI_Gfx cannot be read here. IsVarArg is not referenced in the visible
// lines.
1164 bool IsVarArg) {
1165 switch (CC) {
1168 llvm_unreachable("kernels should not be handled here");
1178 return RetCC_SI_Shader;
1181 return RetCC_SI_Gfx;
1182 case CallingConv::C:
1183 case CallingConv::Fast:
1184 case CallingConv::Cold:
1185 return RetCC_AMDGPU_Func;
1186 default:
1187 reportFatalUsageError("unsupported calling convention");
1188 }
1189}
1190
1191/// The SelectionDAGBuilder will automatically promote function arguments
1192/// with illegal types. However, this does not work for the AMDGPU targets
1193/// since the function arguments are stored in memory as these illegal types.
1194/// In order to handle this properly we need to get the original type sizes
1195/// from the LLVM IR Function and fix up the ISD::InputArg values before
1196/// passing them to AnalyzeFormalArguments().
1197
1198/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1199/// input values across multiple registers. Each item in the Ins array
1200/// represents a single value that will be stored in registers. Ins[x].VT is
1201/// the value type of the value that will be stored in the register, so
1202/// whatever SDNode we lower the argument to needs to be this type.
1203///
1204/// In order to correctly lower the arguments we need to know the size of each
1205/// argument. Since Ins[x].VT gives us the size of the register that will
1206/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1207/// for the original function argument so that we can deduce the correct memory
1208/// type to use for Ins[x]. In most cases the correct memory type will be
1209/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1210/// we have a kernel argument of type v8i8, this argument will be split into
1211/// 8 parts and each part will be represented by its own item in the Ins array.
1212/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1213/// the argument before it was split. From this, we deduce that the memory type
1214/// for each individual part is i8. We pass the memory type as LocVT to the
1215/// calling convention analysis function and the register type (Ins[x].VT) as
1216/// the ValVT.
// Walks the IR arguments of the current function, computes the in-memory
// offset and memory type of every register-sized part, and records one
// custom-memory CCValAssign per part in State.
//
// State: receives the locations through State.addLoc().
// Ins:   not read in the visible lines; offsets are recomputed from the IR
//        arguments instead (see the comment inside the loop).
//
// NOTE(review): the signature line (1217), line 1224 (presumably where CC is
// defined), line 1252 (presumably the declaration of Offsets) and line 1324
// (the end of the getCustomMem call) are absent from this listing.
1218 CCState &State,
1219 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1220 const MachineFunction &MF = State.getMachineFunction();
1221 const Function &Fn = MF.getFunction();
1222 LLVMContext &Ctx = Fn.getContext();
1223 const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
1225
1226 Align MaxAlign = Align(1);
1227 uint64_t ExplicitArgOffset = 0;
1228 const DataLayout &DL = Fn.getDataLayout();
1229
// Running index used as the value number of each recorded location; it is
// incremented once per register part.
1230 unsigned InIndex = 0;
1231
1232 for (const Argument &Arg : Fn.args()) {
// For byref arguments the layout comes from the pointee type and the
// parameter alignment; otherwise from the argument type's ABI alignment.
1233 const bool IsByRef = Arg.hasByRefAttr();
1234 Type *BaseArgTy = Arg.getType();
1235 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1236 Align Alignment = DL.getValueOrABITypeAlignment(
1237 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1238 MaxAlign = std::max(Alignment, MaxAlign);
1239 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1240
// ArgOffset is this argument's aligned offset plus the explicit-argument
// base; ExplicitArgOffset then advances past the argument's alloc size.
1241 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1242 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1243
1244 // We're basically throwing away everything passed into us and starting over
1245 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1246 // to us as computed in Ins.
1247 //
1248 // We also need to figure out what type legalization is trying to do to get
1249 // the correct memory offsets.
1250
1251 SmallVector<EVT, 16> ValueVTs;
1253 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1254 &Offsets, ArgOffset);
1255
1256 for (unsigned Value = 0, NumValues = ValueVTs.size();
1257 Value != NumValues; ++Value) {
1258 uint64_t BasePartOffset = Offsets[Value];
1259
1260 EVT ArgVT = ValueVTs[Value];
1261 EVT MemVT = ArgVT;
1262 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1263 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1264
// Deduce MemVT, the memory type of ONE register part of this value.
1265 if (NumRegs == 1) {
1266 // This argument is not split, so the IR type is the memory type.
1267 if (ArgVT.isExtended()) {
1268 // We have an extended type, like i24, so we should just use the
1269 // register type.
1270 MemVT = RegisterVT;
1271 } else {
1272 MemVT = ArgVT;
1273 }
1274 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1275 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1276 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1277 // We have a vector value which has been split into a vector with
1278 // the same scalar type, but fewer elements. This should handle
1279 // all the floating-point vector types.
1280 MemVT = RegisterVT;
1281 } else if (ArgVT.isVector() &&
1282 ArgVT.getVectorNumElements() == NumRegs) {
1283 // This arg has been split so that each element is stored in a separate
1284 // register.
1285 MemVT = ArgVT.getScalarType();
1286 } else if (ArgVT.isExtended()) {
1287 // We have an extended type, like i65.
1288 MemVT = RegisterVT;
1289 } else {
// Remaining case: divide the value's store size evenly across the parts.
1290 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1291 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1292 if (RegisterVT.isInteger()) {
1293 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1294 } else if (RegisterVT.isVector()) {
1295 assert(!RegisterVT.getScalarType().isFloatingPoint());
1296 unsigned NumElements = RegisterVT.getVectorNumElements();
1297 assert(MemoryBits % NumElements == 0);
1298 // This vector type has been split into another vector type with
1299 // a different elements size.
1300 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1301 MemoryBits / NumElements);
1302 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1303 } else {
1304 llvm_unreachable("cannot deduce memory type.");
1305 }
1306 }
1307
1308 // Convert one element vectors to scalar.
1309 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1310 MemVT = MemVT.getScalarType();
1311
1312 // Round up vec3/vec5 argument.
1313 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1314 MemVT = MemVT.getPow2VectorType(State.getContext());
1315 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1316 MemVT = MemVT.getRoundIntegerType(State.getContext());
1317 }
1318
// Emit one location per register part; consecutive parts are spaced by
// the store size of MemVT.
1319 unsigned PartOffset = 0;
1320 for (unsigned i = 0; i != NumRegs; ++i) {
1321 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1322 BasePartOffset + PartOffset,
1323 MemVT.getSimpleVT(),
1325 PartOffset += MemVT.getStoreSize();
1326 }
1327 }
1328 }
1329}
1330
// Lowers a return by emitting an AMDGPUISD::ENDPGM node on the incoming
// chain. CallConv, isVarArg and OutVals are not consulted in the visible
// lines; the assertion that there are no return values is disabled (see the
// FIXME below).
// NOTE(review): signature lines 1331 and 1334 are absent from this listing.
1332 SDValue Chain, CallingConv::ID CallConv,
1333 bool isVarArg,
1335 const SmallVectorImpl<SDValue> &OutVals,
1336 const SDLoc &DL, SelectionDAG &DAG) const {
1337 // FIXME: Fails for r600 tests
1338 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1339 // "wave terminate should not have return values");
1340 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1341}
1342
1343//===---------------------------------------------------------------------===//
1344// Target specific lowering
1345//===---------------------------------------------------------------------===//
1346
1347/// Selects the correct CCAssignFn for a given CallingConvention value.
1352
1357
// Builds a TokenFactor that joins the incoming chain with the output chain of
// every load, among the users of the DAG entry node, that reads a
// negative-index frame object whose byte range overlaps the frame object
// ClobberedFI.
//
// DAG:         graph whose entry-node users are scanned.
// MFI:         supplies the offset and size of each frame object.
// ClobberedFI: frame index of the object about to be overwritten.
// Returns the TokenFactor node (type MVT::Other).
//
// NOTE(review): the signature line (1358), which presumably declares Chain
// and the function name, is absent from this listing.
1359 SelectionDAG &DAG,
1360 MachineFrameInfo &MFI,
1361 int ClobberedFI) const {
1362 SmallVector<SDValue, 8> ArgChains;
// Inclusive byte range [FirstByte, LastByte] of the clobbered object.
1363 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1364 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1365
1366 // Include the original chain at the beginning of the list. When this is
1367 // used by target LowerCall hooks, this helps the legalizer find the
1368 // CALLSEQ_BEGIN node.
1369 ArgChains.push_back(Chain);
1370
1371 // Add a chain value for each load of a stack argument that overlaps the
// clobbered object.
1372 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1373 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1374 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1375 if (FI->getIndex() < 0) {
1376 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1377 int64_t InLastByte = InFirstByte;
1378 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1379
// The two inclusive ranges overlap when either range's first byte lies
// inside the other range.
1380 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1381 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1382 ArgChains.push_back(SDValue(L, 1));
1383 }
1384 }
1385 }
1386 }
1387
1388 // Build a tokenfactor for all the chains.
1389 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1390}
1391
// Reports an unsupported call as a DiagnosticInfoUnsupported and returns a
// chain that lets lowering continue.
//
// The message is Reason followed by the callee name, or "<unknown>" when the
// callee matches none of the checks below. For a non-tail call, one poison
// value per entry of CLI.Ins is appended to InVals. On r600 the incoming
// chain is returned unchanged; otherwise an empty CALLSEQ_START/CALLSEQ_END
// pair is emitted and its chain returned.
//
// NOTE(review): the first signature lines (1392-1393) and line 1402 (the
// first branch of the callee-name lookup) are absent from this listing; the
// name lowerUnhandledCall is taken from the visible call site in this file.
1394 StringRef Reason) const {
1395 SDValue Callee = CLI.Callee;
1396 SelectionDAG &DAG = CLI.DAG;
1397
1398 const Function &Fn = DAG.getMachineFunction().getFunction();
1399
1400 StringRef FuncName("<unknown>");
1401
1403 FuncName = G->getSymbol();
1404 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1405 FuncName = G->getGlobal()->getName();
1406
1407 DAG.getContext()->diagnose(
1408 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1409
// Give every expected call result a placeholder value.
1410 if (!CLI.IsTailCall) {
1411 for (ISD::InputArg &Arg : CLI.Ins)
1412 InVals.push_back(DAG.getPOISON(Arg.VT));
1413 }
1414
1415 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1416 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1417 return CLI.Chain;
1418
1419 SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
1420 return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
1421}
1422
// Treats every call as unsupported: forwards to lowerUnhandledCall with the
// message prefix "unsupported call to function ".
// NOTE(review): the signature line (1423) is absent from this listing.
1424 SmallVectorImpl<SDValue> &InVals) const {
1425 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1426}
1427
// Handles a dynamic alloca by reporting "unsupported dynamic alloca" and
// returning merged values {constant 0 of the result type, operand 0 of Op}.
// NOTE(review): the signature line (1428) and line 1432 (the start of the
// diagnostic call) are absent from this listing; operand 0 is presumably the
// chain -- confirm.
1429 SelectionDAG &DAG) const {
1430 const Function &Fn = DAG.getMachineFunction().getFunction();
1431
1433 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1434 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1435 return DAG.getMergeValues(Ops, SDLoc());
1436}
1437
// Dispatches a node to its opcode-specific lowering routine and returns that
// routine's result. An opcode with no case prints the node to errs() and hits
// llvm_unreachable.
// NOTE(review): the signature line (1438) and several `case` labels (1446,
// 1448, 1476-1477, 1480, 1482, 1486) are absent from this listing, so the
// visible case list is incomplete.
1439 SelectionDAG &DAG) const {
1440 switch (Op.getOpcode()) {
1441 default:
1442 Op->print(errs(), &DAG);
1443 llvm_unreachable("Custom lowering code for this "
1444 "instruction is not implemented yet!");
1445 break;
1447 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1449 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1450 case ISD::SDIVREM:
1451 return LowerSDIVREM(Op, DAG);
1452 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1453 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1454 case ISD::FRINT: return LowerFRINT(Op, DAG);
1455 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1456 case ISD::FROUNDEVEN:
1457 return LowerFROUNDEVEN(Op, DAG);
1458 case ISD::FROUND: return LowerFROUND(Op, DAG);
1459 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1460 case ISD::FLOG2:
1461 return LowerFLOG2(Op, DAG);
1462 case ISD::FLOG:
1463 case ISD::FLOG10:
1464 return LowerFLOGCommon(Op, DAG);
1465 case ISD::FEXP:
1466 case ISD::FEXP10:
1467 return lowerFEXP(Op, DAG);
1468 case ISD::FEXP2:
1469 return lowerFEXP2(Op, DAG);
1470 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1471 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1472 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1473 case ISD::FP_TO_SINT:
1474 case ISD::FP_TO_UINT:
1475 return LowerFP_TO_INT(Op, DAG);
1478 return LowerFP_TO_INT_SAT(Op, DAG);
1479 case ISD::CTTZ:
1481 case ISD::CTLZ:
1483 return LowerCTLZ_CTTZ(Op, DAG);
1484 case ISD::CTLS:
1485 return LowerCTLS(Op, DAG);
1487 }
1488 return Op;
1489}
1490
// Produces replacement values for node N. For the log/exp/ctlz opcodes below
// the matching lowering helper is called on result 0 of N, and its value is
// appended to Results only when the helper returns a non-null SDValue. Every
// other opcode leaves Results untouched.
// NOTE(review): the signature lines (1491-1492) and the `case` labels on
// lines 1495 and 1522 are absent from this listing; judging by the comment,
// the first branch handles sign_extend_inreg.
1493 SelectionDAG &DAG) const {
1494 switch (N->getOpcode()) {
1496 // Different parts of legalization seem to interpret which type of
1497 // sign_extend_inreg is the one to check for custom lowering. The extended
1498 // from type is what really matters, but some places check for custom
1499 // lowering of the result type. This results in trying to use
1500 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1501 // nothing here and let the illegal result integer be handled normally.
1502 return;
1503 case ISD::FLOG2:
1504 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1505 Results.push_back(Lowered);
1506 return;
1507 case ISD::FLOG:
1508 case ISD::FLOG10:
1509 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1510 Results.push_back(Lowered);
1511 return;
1512 case ISD::FEXP2:
1513 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1514 Results.push_back(Lowered);
1515 return;
1516 case ISD::FEXP:
1517 case ISD::FEXP10:
1518 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1519 Results.push_back(Lowered);
1520 return;
1521 case ISD::CTLZ:
1523 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1524 Results.push_back(Lowered);
1525 return;
1526 default:
1527 return;
1528 }
1529}
1530
// Rebuilds the node as a target block address carrying the same block
// address, offset and target flags, typed as Op's value type.
// NOTE(review): the signature line (1531) and line 1533 (presumably the
// definition of BA) are absent from this listing; SL is not used in the
// visible lines.
1532 SelectionDAG &DAG) const {
1534 SDLoc SL(Op);
1535 EVT VT = Op.getValueType();
1536 return DAG.getTargetBlockAddress(BA->getBlockAddress(), VT, BA->getOffset(),
1537 BA->getTargetFlags());
1538}
1539
// Lowers a global address to a constant where the visible code knows how:
//   - outside a module entry function, a global that already has an assigned
//     address (lookup on the missing line 1551) becomes that constant; for a
//     named barrier the barrier count is also recorded on MFI;
//   - a LOCAL/REGION address-space global in a module entry function is
//     allocated through MFI->allocateLDSGlobal and becomes its offset;
//   - a LOCAL/REGION global used from a non-entry function (subject to the
//     name check and the missing condition on line 1566) emits a warning
//     diagnostic plus a trap and yields poison.
// Any other global yields an empty SDValue.
//
// NOTE(review): lines 1540 (signature), 1545 (presumably the definitions of
// G and MFI), 1551, 1566 and 1569 are absent from this listing.
1541 SDValue Op,
1542 SelectionDAG &DAG) const {
1543
1544 const DataLayout &DL = DAG.getDataLayout();
1546 const GlobalValue *GV = G->getGlobal();
1547
1548 if (!MFI->isModuleEntryFunction()) {
1549 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1550 if (std::optional<uint32_t> Address =
1552 if (IsNamedBarrier) {
// The barrier count is the global's size divided by 16.
1553 unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
1554 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1555 }
1556 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1557 } else if (IsNamedBarrier) {
1558 llvm_unreachable("named barrier should have an assigned address");
1559 }
1560 }
1561
1562 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1563 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1564 if (!MFI->isModuleEntryFunction() &&
1565 GV->getName() != "llvm.amdgcn.module.lds" &&
// Note: this SDLoc named DL shadows the DataLayout DL declared above for
// the rest of this inner scope.
1567 SDLoc DL(Op);
1568 const Function &Fn = DAG.getMachineFunction().getFunction();
1570 Fn, "local memory global used by non-kernel function",
1571 DL.getDebugLoc(), DS_Warning));
1572
1573 // We currently don't have a way to correctly allocate LDS objects that
1574 // aren't directly associated with a kernel. We do force inlining of
1575 // functions that use local objects. However, if these dead functions are
1576 // not eliminated, we don't want a compile time error. Just emit a warning
1577 // and a trap, since there should be no callable path here.
1578 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1579 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1580 Trap, DAG.getRoot());
1581 DAG.setRoot(OutputChain);
1582 return DAG.getPOISON(Op.getValueType());
1583 }
1584
1585 // XXX: What does the value of G->getOffset() mean?
1586 assert(G->getOffset() == 0 &&
1587 "Do not know what to do with an non-zero offset");
1588
1589 // TODO: We could emit code to handle the initialization somewhere.
1590 // We ignore the initializer for now and legalize it to allow selection.
1591 // The initializer will anyway get errored out during assembly emission.
1592 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1593 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1594 }
1595 return SDValue();
1596}
1597
// Lowers a vector concatenation to a single build_vector.
//
// When the result's elements are narrower than 32 bits and each operand's
// total size is a non-zero multiple of 32 bits, every operand is bitcast to
// i32 (or a vector of i32), the 32-bit pieces are collected into one
// build_vector, and that vector is bitcast back to the result type.
// Otherwise the elements of all operands are extracted individually and
// rebuilt in the result type.
//
// NOTE(review): lines 1598 (signature), 1600 (presumably the declaration of
// Args) and 1609 (the middle of the NewEltVT expression) are absent from this
// listing.
1599 SelectionDAG &DAG) const {
1601 SDLoc SL(Op);
1602
1603 EVT VT = Op.getValueType();
1604 if (VT.getVectorElementType().getSizeInBits() < 32) {
// Only operand 0's type is inspected; NOTE(review): presumably all operands
// share one type -- confirm.
1605 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1606 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1607 unsigned NewNumElt = OpBitSize / 32;
1608 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1610 MVT::i32, NewNumElt);
1611 for (const SDUse &U : Op->ops()) {
1612 SDValue In = U.get();
1613 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1614 if (NewNumElt > 1)
1615 DAG.ExtractVectorElements(NewIn, Args);
1616 else
1617 Args.push_back(NewIn);
1618 }
1619
1620 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1621 NewNumElt * Op.getNumOperands());
1622 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1623 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1624 }
1625 }
1626
// Generic path: flatten every operand element by element.
1627 for (const SDUse &U : Op->ops())
1628 DAG.ExtractVectorElements(U.get(), Args);
1629
1630 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1631}
1632
// Lowers a subvector extraction starting at the constant index in operand 1.
//
// For 16-bit elements with an even start index, the source is bitcast to a
// vector of i32 and the extraction is done on 32-bit pieces (two elements at
// a time), then bitcast back to the result type. Otherwise the requested
// elements are extracted individually and rebuilt as a build_vector.
//
// NOTE(review): lines 1633 (signature), 1636 (presumably the declaration of
// Args) and 1663 (the final argument of the ExtractVectorElements call) are
// absent from this listing.
1634 SelectionDAG &DAG) const {
1635 SDLoc SL(Op);
1637 unsigned Start = Op.getConstantOperandVal(1);
1638 EVT VT = Op.getValueType();
1639 EVT SrcVT = Op.getOperand(0).getValueType();
1640
1641 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1642 unsigned NumElt = VT.getVectorNumElements();
1643 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1644 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1645
1646 // Extract 32-bit registers at a time.
1647 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1648 EVT NewVT = NumElt == 2
1649 ? MVT::i32
1650 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1651 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1652
// Indices and counts are halved because each i32 holds two 16-bit elements.
1653 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1654 if (NumElt == 2)
1655 Tmp = Args[0];
1656 else
1657 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1658
1659 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1660 }
1661
1662 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1664
1665 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1666}
1667
1668// TODO: Handle fabs too
// Returns the operand of Val when Val is an ISD::FNEG node; otherwise returns
// Val unchanged.
// NOTE(review): the signature line (1669) is absent from this listing; the
// name is presumably peekFNeg, as used by a call elsewhere in this file.
1670 if (Val.getOpcode() == ISD::FNEG)
1671 return Val.getOperand(0);
1672
1673 return Val;
1674}
1675
// Looks through sign-manipulating nodes: strips at most one FNEG, then at
// most one FABS, then at most one FCOPYSIGN (taking operand 0 each time), in
// that fixed order, and returns what remains.
// NOTE(review): the signature line (1676) is absent from this listing, so the
// function name cannot be read here.
1677 if (Val.getOpcode() == ISD::FNEG)
1678 Val = Val.getOperand(0);
1679 if (Val.getOpcode() == ISD::FABS)
1680 Val = Val.getOperand(0);
1681 if (Val.getOpcode() == ISD::FCOPYSIGN)
1682 Val = Val.getOperand(0);
1683 return Val;
1684}
1685
// Turns a select on a floating-point compare into an FMIN_LEGACY or
// FMAX_LEGACY node, choosing the node and its operand order from the
// condition code and from whether LHS is the value selected when the compare
// is true.
//
// Equality, ordered/unordered-only and constant condition codes yield an
// empty SDValue. The ordered and don't-care less/greater cases also bail out
// under the guard whose first line is missing (see below).
//
// NOTE(review): lines 1686 (signature), 1719 and 1740 (first halves of the
// two bail-out conditions) are absent from this listing. False is not read in
// the visible lines.
1687 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1688 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1689 SelectionDAG &DAG = DCI.DAG;
1690 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1691 switch (CCOpcode) {
1692 case ISD::SETOEQ:
1693 case ISD::SETONE:
1694 case ISD::SETUNE:
1695 case ISD::SETNE:
1696 case ISD::SETUEQ:
1697 case ISD::SETEQ:
1698 case ISD::SETFALSE:
1699 case ISD::SETFALSE2:
1700 case ISD::SETTRUE:
1701 case ISD::SETTRUE2:
1702 case ISD::SETUO:
1703 case ISD::SETO:
1704 break;
// Unordered less-than: operands are swapped relative to the ordered case.
1705 case ISD::SETULE:
1706 case ISD::SETULT: {
1707 if (LHS == True)
1708 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1709 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1710 }
1711 case ISD::SETOLE:
1712 case ISD::SETOLT:
1713 case ISD::SETLE:
1714 case ISD::SETLT: {
1715 // Ordered. Assume ordered for undefined.
1716
1717 // Only do this after legalization to avoid interfering with other combines
1718 // which might occur.
1720 !DCI.isCalledByLegalizer())
1721 return SDValue();
1722
1723 // We need to permute the operands to get the correct NaN behavior. The
1724 // selected operand is the second one based on the failing compare with NaN,
1725 // so permute it based on the compare type the hardware uses.
1726 if (LHS == True)
1727 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1728 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1729 }
// Unordered greater-than: mirror image of the unordered less-than case.
1730 case ISD::SETUGE:
1731 case ISD::SETUGT: {
1732 if (LHS == True)
1733 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1734 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1735 }
// Ordered / don't-care greater-than: mirror image of the ordered less-than
// case, with the same bail-out guard.
1736 case ISD::SETGT:
1737 case ISD::SETGE:
1738 case ISD::SETOGE:
1739 case ISD::SETOGT: {
1741 !DCI.isCalledByLegalizer())
1742 return SDValue();
1743
1744 if (LHS == True)
1745 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1746 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1747 }
1748 case ISD::SETCC_INVALID:
1749 llvm_unreachable("Invalid setcc condcode!");
1750 }
1751 return SDValue();
1752}
1753
1754/// Generate Min/Max node
///
/// If the select arms are exactly the compare operands (in either order), the
/// work is delegated to combineFMinMaxLegacyImpl. Otherwise one extra shape is
/// tried: the true arm is the negated LHS and the false arm is the negated
/// constant RHS, in which case the min/max is formed on the un-negated values
/// and wrapped in an FNEG. Returns an empty SDValue when nothing matches.
///
/// NOTE(review): lines 1755 (signature) and 1768-1769 (presumably the
/// definitions of CRHS and CFalse) are absent from this listing.
1756 SDValue LHS, SDValue RHS,
1757 SDValue True, SDValue False,
1758 SDValue CC,
1759 DAGCombinerInfo &DCI) const {
1760 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1761 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1762
1763 SelectionDAG &DAG = DCI.DAG;
1764
1765 // If we can't directly match this, try to see if we can fold an fneg to
1766 // match.
1767
1770 SDValue NegTrue = peekFNeg(True);
1771
1772 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1773 // fmin/fmax.
1774 //
1775 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1776 // -> fneg (fmin_legacy lhs, K)
1777 //
1778 // TODO: Use getNegatedExpression
1779 if (LHS == NegTrue && CFalse && CRHS) {
// Only proceed when the false constant is exactly the negation of the
// compare's constant operand.
1780 APFloat NegRHS = neg(CRHS->getValueAPF());
1781 if (NegRHS == CFalse->getValueAPF()) {
1782 SDValue Combined =
1783 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1784 if (Combined)
1785 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1786 return SDValue();
1787 }
1788 }
1789
1790 return SDValue();
1791}
1792
// Splits Op into two i32 values by bitcasting it to v2i32 and extracting
// element 0 (returned first, as Lo) and element 1 (returned second, as Hi).
// NOTE(review): the signature line (1794) is absent from this listing; Op is
// presumably a 64-bit value, as the v2i32 bitcast implies.
1793std::pair<SDValue, SDValue>
1795 SDLoc SL(Op);
1796
1797 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1798
1799 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1800 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1801
1802 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1803 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1804
1805 return std::pair(Lo, Hi);
1806}
1807
// Returns element 0 of Op viewed as v2i32, as an i32 value.
// NOTE(review): the signature line (1808) is absent from this listing.
1809 SDLoc SL(Op);
1810
1811 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1812 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1813 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1814}
1815
// Returns element 1 of Op viewed as v2i32, as an i32 value.
// NOTE(review): the signature line (1816) is absent from this listing.
1817 SDLoc SL(Op);
1818
1819 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1820 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1821 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1822}
1823
1824// Split a vector type into two parts. The first part is a power of two vector.
1825// The second part is whatever is left over, and is a scalar if it would
1826// otherwise be a 1-vector.
// The low part has PowerOf2Ceil((NumElts + 1) / 2) elements; for example a
// 3-element vector splits into a 2-element vector and a scalar, and a
// 4-element vector into two 2-element vectors.
// NOTE(review): the signature line (1828) is absent from this listing.
1827std::pair<EVT, EVT>
1829 EVT LoVT, HiVT;
1830 EVT EltVT = VT.getVectorElementType();
1831 unsigned NumElts = VT.getVectorNumElements();
1832 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1833 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1834 HiVT = NumElts - LoNumElts == 1
1835 ? EltVT
1836 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1837 return std::pair(LoVT, HiVT);
1838}
1839
1840// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1841// scalar.
// Returns {Lo, Hi}. The high part starts at element LoVT.getVectorNumElements()
// of N. When HiVT is a vector whose element count does not divide N's element
// count, the high part is assembled from individually extracted elements
// instead of a single extraction.
// NOTE(review): lines 1843 (signature), 1847 (start of the assert), 1851
// (start of the Lo extraction), 1861, 1866 (presumably the declaration of
// Elts) and 1873 are absent from this listing.
1842std::pair<SDValue, SDValue>
1844 const EVT &LoVT, const EVT &HiVT,
1845 SelectionDAG &DAG) const {
1846 EVT VT = N.getValueType();
1848 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1849 VT.getVectorNumElements() &&
1850 "More vector elements requested than available!");
1852 DAG.getVectorIdxConstant(0, DL));
1853
1854 unsigned LoNumElts = LoVT.getVectorNumElements();
1855
1856 if (HiVT.isVector()) {
1857 unsigned HiNumElts = HiVT.getVectorNumElements();
1858 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1859 // Avoid creating an extract_subvector with an index that isn't a multiple
1860 // of the result type.
1862 DAG.getConstant(LoNumElts, DL, MVT::i32));
1863 return {Lo, Hi};
1864 }
1865
1867 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1868 /*Count=*/HiNumElts);
1869 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1870 return {Lo, Hi};
1871 }
1872
// Scalar high part: the index passed here is LoNumElts.
1874 DAG.getVectorIdxConstant(LoNumElts, DL));
1875 return {Lo, Hi};
1876}
1877
// Splits a vector load into a low and a high load and joins the results.
//
// A 2-element vector is scalarized instead. Otherwise the value and memory
// types are split with getSplitDestVTs, the low part is loaded from the base
// pointer and the high part from base + store size of the low memory type,
// both keeping the original extension type, memory-operand flags and AA info.
// The function returns merged values {joined vector, TokenFactor of the two
// load chains}.
//
// NOTE(review): lines 1878 (signature), 1880 (presumably the definition of
// Load), 1927 and 1929 (parts of the second Join node) are absent from this
// listing.
1879 SelectionDAG &DAG) const {
1881 EVT VT = Op.getValueType();
1882 SDLoc SL(Op);
1883
1884
1885 // If this is a 2 element vector, we really want to scalarize and not create
1886 // weird 1 element vectors.
1887 if (VT.getVectorNumElements() == 2) {
1888 SDValue Ops[2];
1889 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1890 return DAG.getMergeValues(Ops, SL);
1891 }
1892
1893 SDValue BasePtr = Load->getBasePtr();
1894 EVT MemVT = Load->getMemoryVT();
1895
1896 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1897
1898 EVT LoVT, HiVT;
1899 EVT LoMemVT, HiMemVT;
1900 SDValue Lo, Hi;
1901
1902 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1903 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
// NOTE(review): Lo and Hi are assigned here but are not read again in the
// visible lines of this function -- confirm whether this call is needed.
1904 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1905
// The high half sits Size bytes past the base, so its alignment is the
// common alignment of the base alignment and that offset.
1906 unsigned Size = LoMemVT.getStoreSize();
1907 Align BaseAlign = Load->getAlign();
1908 Align HiAlign = commonAlignment(BaseAlign, Size);
1909
1910 SDValue LoLoad = DAG.getExtLoad(
1911 Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
1912 LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
1913 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1914 SDValue HiLoad = DAG.getExtLoad(
1915 Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
1916 SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
1917 Load->getMemOperand()->getFlags(), Load->getAAInfo());
1918
1919 SDValue Join;
1920 if (LoVT == HiVT) {
1921 // This is the case that the vector is power of two so was evenly split.
1922 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1923 } else {
// Uneven split: insert the low part at index 0 of a poison vector, then
// add the high part with the second node below.
1924 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1925 DAG.getVectorIdxConstant(0, SL));
1926 Join = DAG.getNode(
1928 VT, Join, HiLoad,
1930 }
1931
1932 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1933 LoLoad.getValue(1), HiLoad.getValue(1))};
1934
1935 return DAG.getMergeValues(Ops, SL);
1936}
1937
// Lowers a vector load either by widening or by splitting.
//
// A 3-element load that is at least 8-byte aligned, or whose pointer info is
// known dereferenceable for 16 bytes, is replaced by one wider load from
// which the original-width subvector at index 0 is extracted. Every other
// load is passed to SplitVectorLoad. The function returns merged values
// {loaded value, chain of the wide load}.
//
// NOTE(review): lines 1938 (signature), 1940 (presumably the definition of
// Load), 1959 and 1961 (the definitions of WideVT and WideMemVT) are absent
// from this listing. Unlike the split path, the wide load is created without
// Load->getAAInfo() in the visible lines.
1939 SelectionDAG &DAG) const {
1941 EVT VT = Op.getValueType();
1942 SDValue BasePtr = Load->getBasePtr();
1943 EVT MemVT = Load->getMemoryVT();
1944 SDLoc SL(Op);
1945 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1946 Align BaseAlign = Load->getAlign();
1947 unsigned NumElements = MemVT.getVectorNumElements();
1948
1949 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1950 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1951 if (NumElements != 3 ||
1952 (BaseAlign < Align(8) &&
1953 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1954 return SplitVectorLoad(Op, DAG);
1955
// Always true here: the guard above already returned for any other count.
1956 assert(NumElements == 3);
1957
1958 EVT WideVT =
1960 EVT WideMemVT =
1962 SDValue WideLoad = DAG.getExtLoad(
1963 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1964 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1965 return DAG.getMergeValues(
1966 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1967 DAG.getVectorIdxConstant(0, SL)),
1968 WideLoad.getValue(1)},
1969 SL);
1970}
1971
// Splits a vector store into a low and a high truncating store and returns a
// TokenFactor of the two.
//
// A 2-element vector is scalarized instead. Otherwise the stored value and
// the memory type are split with getSplitDestVTs/splitVector; the low part is
// stored at the base pointer and the high part at base + store size of the
// low memory type. Both stores hang off the original chain and keep the
// original memory-operand flags and AA info.
//
// NOTE(review): lines 1972 (signature) and 1974 (presumably the definition of
// Store) are absent from this listing.
1973 SelectionDAG &DAG) const {
1975 SDValue Val = Store->getValue();
1976 EVT VT = Val.getValueType();
1977
1978 // If this is a 2 element vector, we really want to scalarize and not create
1979 // weird 1 element vectors.
1980 if (VT.getVectorNumElements() == 2)
1981 return scalarizeVectorStore(Store, DAG);
1982
1983 EVT MemVT = Store->getMemoryVT();
1984 SDValue Chain = Store->getChain();
1985 SDValue BasePtr = Store->getBasePtr();
1986 SDLoc SL(Op);
1987
1988 EVT LoVT, HiVT;
1989 EVT LoMemVT, HiMemVT;
1990 SDValue Lo, Hi;
1991
1992 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1993 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1994 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1995
1996 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1997
1998 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1999 Align BaseAlign = Store->getAlign();
// The high half sits Size bytes past the base, so its alignment is the
// common alignment of the base alignment and that offset.
2000 unsigned Size = LoMemVT.getStoreSize();
2001 Align HiAlign = commonAlignment(BaseAlign, Size);
2002
2003 SDValue LoStore =
2004 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
2005 Store->getMemOperand()->getFlags(), Store->getAAInfo());
2006 SDValue HiStore = DAG.getTruncStore(
2007 Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
2008 Store->getMemOperand()->getFlags(), Store->getAAInfo());
2009
2010 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
2011}
2012
2013// This is a shortcut for integer division because we have fast i32<->f32
2014// conversions, and fast f32 reciprocal instructions. The fractional part of a
2015// float is enough to accurately represent up to a 24-bit integer.
//
// Op must be an i32 node whose operands 0 and 1 are the dividend and divisor.
// Sign selects the signed path (true) or the unsigned path (false).
//
// Returns an empty SDValue when the operands are not provably narrow enough:
// the signed path requires at least 9 known sign bits on both operands, the
// unsigned path requires both known maximum values to fit in 24 bits.
// Otherwise returns merged values {quotient, remainder}.
//
// NOTE(review): lines 2016 (signature), 2051-2052 (presumably the definitions
// of ToFp and ToInt), 2089, 2093, 2095 and 2101 (parts of the mad-opcode
// selection) are absent from this listing.
2017 bool Sign) const {
2018 SDLoc DL(Op);
2019 EVT VT = Op.getValueType();
2020 assert(VT == MVT::i32 && "LowerDIVREM24 expects an i32");
2021
2022 SDValue LHS = Op.getOperand(0);
2023 SDValue RHS = Op.getOperand(1);
2024 MVT IntVT = MVT::i32;
2025 MVT FltVT = MVT::f32;
2026
2027 unsigned LHSSignBits;
2028 unsigned RHSSignBits;
2029 if (Sign) {
2030 LHSSignBits = DAG.ComputeNumSignBits(LHS);
2031 RHSSignBits = DAG.ComputeNumSignBits(RHS);
2032 if (LHSSignBits < 9 || RHSSignBits < 9)
2033 return SDValue();
2034 } else {
2035 KnownBits LHSKnown = DAG.computeKnownBits(LHS);
2036 KnownBits RHSKnown = DAG.computeKnownBits(RHS);
2037 APInt U24Max = APInt::getLowBitsSet(32, 24);
2038 if (LHSKnown.getMaxValue().ugt(U24Max) ||
2039 RHSKnown.getMaxValue().ugt(U24Max))
2040 return SDValue();
2041 LHSSignBits = LHSKnown.countMinLeadingZeros();
2042 RHSSignBits = RHSKnown.countMinLeadingZeros();
2043 }
2044
// DivBits is the number of result bits that can be significant; the signed
// path keeps one extra bit for the sign.
2045 unsigned BitSize = VT.getSizeInBits();
2046 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2047 unsigned DivBits = BitSize - SignBits;
2048 if (Sign)
2049 ++DivBits;
2050
2053
// jq is the correction added to the truncated quotient: 1 for the unsigned
// path, or a value derived from the operands' signs for the signed path.
2054 SDValue jq = DAG.getConstant(1, DL, IntVT);
2055
2056 if (Sign) {
2057 // char|short jq = ia ^ ib;
2058 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2059
2060 // jq = jq >> (bitsize - 2)
2061 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2062 DAG.getConstant(BitSize - 2, DL, VT));
2063
2064 // jq = jq | 0x1
2065 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2066 }
2067
2068 // int ia = (int)LHS;
2069 SDValue ia = LHS;
2070
2071 // int ib = (int)RHS;
2072 SDValue ib = RHS;
2073
2074 // float fa = (float)ia;
2075 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2076
2077 // float fb = (float)ib;
2078 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2079
// float fq = fa * rcp(fb);
2080 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2081 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2082
2083 // fq = trunc(fq);
2084 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2085
2086 // float fqneg = -fq;
2087 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2088
2090
2091 bool UseFmadFtz = false;
2092 if (Subtarget->isGCN()) {
2094 UseFmadFtz =
2096 }
2097
2098 // float fr = mad(fqneg, fb, fa);
2099 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2100 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2102 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2103
2104 // int iq = (int)fq;
2105 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2106
2107 // fr = fabs(fr);
2108 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2109
2110 // fb = fabs(fb);
2111 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2112
2113 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2114
2115 // int cv = fr >= fb;
2116 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2117
2118 // jq = (cv ? jq : 0);
2119 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2120
2121 // dst = iq + jq;
2122 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2123
2124 // Rem needs compensation, it's easier to recompute it
// Rem = LHS - Div * RHS
2125 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2126 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2127
2128 // Truncate to number of bits this divide really is.
2129 if (Sign) {
2130 SDValue InRegSize
2131 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2132 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2133 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2134 } else {
2135 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2136 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2137 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2138 }
2139
2140 return DAG.getMergeValues({ Div, Rem }, DL);
2141}
2142
2144 SelectionDAG &DAG,
2146 SDLoc DL(Op);
2147 EVT VT = Op.getValueType();
2148
2149 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2150
2151 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2152
2153 SDValue One = DAG.getConstant(1, DL, HalfVT);
2154 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2155
2156 //HiLo split
2157 SDValue LHS_Lo, LHS_Hi;
2158 SDValue LHS = Op.getOperand(0);
2159 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2160
2161 SDValue RHS_Lo, RHS_Hi;
2162 SDValue RHS = Op.getOperand(1);
2163 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2164
2165 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2166 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2167
2168 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2169 LHS_Lo, RHS_Lo);
2170
2171 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2172 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2173
2174 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2175 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2176 return;
2177 }
2178
2179 if (isTypeLegal(MVT::i64)) {
2180 // The algorithm here is based on ideas from "Software Integer Division",
2181 // Tom Rodeheffer, August 2008.
2182
2185
2186 // Compute denominator reciprocal.
2187 unsigned FMAD =
2188 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2191 : (unsigned)AMDGPUISD::FMAD_FTZ;
2192
2193 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2194 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2195 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2196 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2197 Cvt_Lo);
2198 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2199 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2200 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2201 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2202 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2203 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2204 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2205 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2206 Mul1);
2207 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2208 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2209 SDValue Rcp64 = DAG.getBitcast(VT,
2210 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2211
2212 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2213 SDValue One64 = DAG.getConstant(1, DL, VT);
2214 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2215 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2216
2217 // First round of UNR (Unsigned integer Newton-Raphson).
2218 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2219 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2220 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2221 SDValue Mulhi1_Lo, Mulhi1_Hi;
2222 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2223 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2224 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2225 Mulhi1_Lo, Zero1);
2226 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2227 Mulhi1_Hi, Add1_Lo.getValue(1));
2228 SDValue Add1 = DAG.getBitcast(VT,
2229 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2230
2231 // Second round of UNR.
2232 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2233 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2234 SDValue Mulhi2_Lo, Mulhi2_Hi;
2235 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2236 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2237 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2238 Mulhi2_Lo, Zero1);
2239 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2240 Mulhi2_Hi, Add2_Lo.getValue(1));
2241 SDValue Add2 = DAG.getBitcast(VT,
2242 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2243
2244 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2245
2246 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2247
2248 SDValue Mul3_Lo, Mul3_Hi;
2249 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2250 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2251 Mul3_Lo, Zero1);
2252 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2253 Mul3_Hi, Sub1_Lo.getValue(1));
2254 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2255 SDValue Sub1 = DAG.getBitcast(VT,
2256 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2257
2258 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2259 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2260 ISD::SETUGE);
2261 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2262 ISD::SETUGE);
2263 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2264
2265 // TODO: Here and below portions of the code can be enclosed into if/endif.
2266 // Currently control flow is unconditional and we have 4 selects after
2267 // potential endif to substitute PHIs.
2268
2269 // if C3 != 0 ...
2270 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2271 RHS_Lo, Zero1);
2272 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2273 RHS_Hi, Sub1_Lo.getValue(1));
2274 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2275 Zero, Sub2_Lo.getValue(1));
2276 SDValue Sub2 = DAG.getBitcast(VT,
2277 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2278
2279 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2280
2281 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2282 ISD::SETUGE);
2283 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2284 ISD::SETUGE);
2285 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2286
2287 // if (C6 != 0)
2288 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2289
2290 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2291 RHS_Lo, Zero1);
2292 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2293 RHS_Hi, Sub2_Lo.getValue(1));
2294 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2295 Zero, Sub3_Lo.getValue(1));
2296 SDValue Sub3 = DAG.getBitcast(VT,
2297 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2298
2299 // endif C6
2300 // endif C3
2301
2302 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2303 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2304
2305 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2306 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2307
2308 Results.push_back(Div);
2309 Results.push_back(Rem);
2310
2311 return;
2312 }
2313
2314 // r600 expansion.
2315 // Get Speculative values
2316 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2317 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2318
2319 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2320 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2321 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2322
2323 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2324 SDValue DIV_Lo = Zero;
2325
2326 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2327
2328 for (unsigned i = 0; i < halfBitWidth; ++i) {
2329 const unsigned bitPos = halfBitWidth - i - 1;
2330 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2331 // Get value of high bit
2332 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2333 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2334 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2335
2336 // Shift
2337 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2338 // Add LHS high bit
2339 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2340
2341 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2342 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2343
2344 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2345
2346 // Update REM
2347 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2348 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2349 }
2350
2351 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2352 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2353 Results.push_back(DIV);
2354 Results.push_back(REM);
2355}
2356
2358 SelectionDAG &DAG) const {
2359 SDLoc DL(Op);
2360 EVT VT = Op.getValueType();
2361
2362 if (VT == MVT::i64) {
2364 LowerUDIVREM64(Op, DAG, Results);
2365 return DAG.getMergeValues(Results, DL);
2366 }
2367
2368 if (VT == MVT::i32) {
2369 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2370 return Res;
2371 }
2372
2373 SDValue X = Op.getOperand(0);
2374 SDValue Y = Op.getOperand(1);
2375
2376 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2377 // algorithm used here.
2378
2379 // Initial estimate of inv(y).
2380 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2381
2382 // One round of UNR.
2383 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2384 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2385 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2386 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2387
2388 // Quotient/remainder estimate.
2389 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2390 SDValue R =
2391 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2392
2393 // First quotient/remainder refinement.
2394 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2395 SDValue One = DAG.getConstant(1, DL, VT);
2396 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2397 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2398 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2399 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2400 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2401
2402 // Second quotient/remainder refinement.
2403 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2404 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2405 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2406 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2407 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2408
2409 return DAG.getMergeValues({Q, R}, DL);
2410}
2411
2413 SelectionDAG &DAG) const {
2414 SDLoc DL(Op);
2415 EVT VT = Op.getValueType();
2416
2417 SDValue LHS = Op.getOperand(0);
2418 SDValue RHS = Op.getOperand(1);
2419
2420 SDValue Zero = DAG.getConstant(0, DL, VT);
2421 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2422
2423 if (VT == MVT::i32) {
2424 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2425 return Res;
2426 }
2427
2428 // LHS must have > 33 sign-bits to ensure that LHS != -2147483648
2429 // Otherwise 32-bit division cannot be used safely.
2430 // -2147483648/1 and -2147483648/-1 are not equal,
2431 // but they produce the same lower 32-bit result.
2432 if (VT == MVT::i64 && DAG.ComputeNumSignBits(LHS) > 33 &&
2433 DAG.ComputeNumSignBits(RHS) > 32) {
2434 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2435
2436 //HiLo split
2437 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2438 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2439 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2440 LHS_Lo, RHS_Lo);
2441 SDValue Res[2] = {
2442 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2443 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2444 };
2445 return DAG.getMergeValues(Res, DL);
2446 }
2447
2448 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2449 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2450 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2451 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2452
2453 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2454 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2455
2456 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2457 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2458
2459 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2460 SDValue Rem = Div.getValue(1);
2461
2462 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2463 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2464
2465 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2466 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2467
2468 SDValue Res[2] = {
2469 Div,
2470 Rem
2471 };
2472 return DAG.getMergeValues(Res, DL);
2473}
2474
2476 SDLoc SL(Op);
2477 SDValue Src = Op.getOperand(0);
2478
2479 // result = trunc(src)
2480 // if (src > 0.0 && src != result)
2481 // result += 1.0
2482
2483 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2484
2485 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2486 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2487
2488 EVT SetCCVT =
2489 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2490
2491 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2492 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2493 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2494
2495 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2496 // TODO: Should this propagate fast-math-flags?
2497 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2498}
2499
2501 SelectionDAG &DAG) {
2502 const unsigned FractBits = 52;
2503 const unsigned ExpBits = 11;
2504
2505 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2506 Hi,
2507 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2508 DAG.getConstant(ExpBits, SL, MVT::i32));
2509 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2510 DAG.getConstant(1023, SL, MVT::i32));
2511
2512 return Exp;
2513}
2514
2516 SDLoc SL(Op);
2517 SDValue Src = Op.getOperand(0);
2518
2519 assert(Op.getValueType() == MVT::f64);
2520
2521 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2522
2523 // Extract the upper half, since this is where we will find the sign and
2524 // exponent.
2525 SDValue Hi = getHiHalf64(Src, DAG);
2526
2527 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2528
2529 const unsigned FractBits = 52;
2530
2531 // Extract the sign bit.
2532 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2533 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2534
2535 // Extend back to 64-bits.
2536 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2537 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2538
2539 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2540 const SDValue FractMask
2541 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2542
2543 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2544 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2545 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2546
2547 EVT SetCCVT =
2548 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2549
2550 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2551
2552 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2553 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2554
2555 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2556 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2557
2558 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2559}
2560
2562 SelectionDAG &DAG) const {
2563 SDLoc SL(Op);
2564 SDValue Src = Op.getOperand(0);
2565
2566 assert(Op.getValueType() == MVT::f64);
2567
2568 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2569 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2570 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2571
2572 // TODO: Should this propagate fast-math-flags?
2573
2574 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2575 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2576
2577 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2578
2579 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2580 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2581
2582 EVT SetCCVT =
2583 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2584 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2585
2586 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2587}
2588
2590 SelectionDAG &DAG) const {
2591 // FNEARBYINT and FRINT are the same, except in their handling of FP
2592 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2593 // rint, so just treat them as equivalent.
2594 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2595 Op.getOperand(0));
2596}
2597
2599 auto VT = Op.getValueType();
2600 auto Arg = Op.getOperand(0u);
2601 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2602}
2603
2604// XXX - May require not supporting f32 denormals?
2605
2606// Don't handle v2f16. The extra instructions to scalarize and repack around the
2607// compare and vselect end up producing worse code than scalarizing the whole
2608// operation.
2610 SDLoc SL(Op);
2611 SDValue X = Op.getOperand(0);
2612 EVT VT = Op.getValueType();
2613
2614 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2615
2616 // TODO: Should this propagate fast-math-flags?
2617
2618 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2619
2620 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2621
2622 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2623 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2624
2625 EVT SetCCVT =
2626 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2627
2628 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2629 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2630 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2631
2632 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2633 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2634}
2635
2637 SDLoc SL(Op);
2638 SDValue Src = Op.getOperand(0);
2639
2640 // result = trunc(src);
2641 // if (src < 0.0 && src != result)
2642 // result += -1.0.
2643
2644 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2645
2646 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2647 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2648
2649 EVT SetCCVT =
2650 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2651
2652 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2653 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2654 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2655
2656 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2657 // TODO: Should this propagate fast-math-flags?
2658 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2659}
2660
2661/// Return true if it's known that \p Src can never be an f32 denormal value.
2663 switch (Src.getOpcode()) {
2664 case ISD::FP_EXTEND:
2665 return Src.getOperand(0).getValueType() == MVT::f16;
2666 case ISD::FP16_TO_FP:
2667 case ISD::FFREXP:
2668 case ISD::FSQRT:
2669 case AMDGPUISD::LOG:
2670 case AMDGPUISD::EXP:
2671 return true;
2673 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2674 switch (IntrinsicID) {
2675 case Intrinsic::amdgcn_frexp_mant:
2676 case Intrinsic::amdgcn_log:
2677 case Intrinsic::amdgcn_log_clamp:
2678 case Intrinsic::amdgcn_exp2:
2679 case Intrinsic::amdgcn_sqrt:
2680 return true;
2681 default:
2682 return false;
2683 }
2684 }
2685 default:
2686 return false;
2687 }
2688
2689 llvm_unreachable("covered opcode switch");
2690}
2691
2693 SDNodeFlags Flags) {
2694 return Flags.hasApproximateFuncs();
2695}
2696
2705
2707 SDValue Src,
2708 SDNodeFlags Flags) const {
2709 SDLoc SL(Src);
2710 EVT VT = Src.getValueType();
2711 const fltSemantics &Semantics = VT.getFltSemantics();
2712 SDValue SmallestNormal =
2713 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2714
2715 // Want to scale denormals up, but negatives and 0 work just as well on the
2716 // scaled path.
2717 SDValue IsLtSmallestNormal = DAG.getSetCC(
2718 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2719 SmallestNormal, ISD::SETOLT);
2720
2721 return IsLtSmallestNormal;
2722}
2723
2725 SDNodeFlags Flags) const {
2726 SDLoc SL(Src);
2727 EVT VT = Src.getValueType();
2728 const fltSemantics &Semantics = VT.getFltSemantics();
2729 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2730
2731 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2732 SDValue IsFinite = DAG.getSetCC(
2733 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2734 Inf, ISD::SETOLT);
2735 return IsFinite;
2736}
2737
2738/// If denormal handling is required return the scaled input to FLOG2, and the
2739/// check for denormal range. Otherwise, return null values.
2740std::pair<SDValue, SDValue>
2742 SDValue Src, SDNodeFlags Flags) const {
2743 if (!needsDenormHandlingF32(DAG, Src, Flags))
2744 return {};
2745
2746 MVT VT = MVT::f32;
2747 const fltSemantics &Semantics = APFloat::IEEEsingle();
2748 SDValue SmallestNormal =
2749 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2750
2751 SDValue IsLtSmallestNormal = DAG.getSetCC(
2752 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2753 SmallestNormal, ISD::SETOLT);
2754
2755 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2756 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2757 SDValue ScaleFactor =
2758 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2759
2760 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2761 return {ScaledInput, IsLtSmallestNormal};
2762}
2763
2765 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2766 // If we have to handle denormals, scale up the input and adjust the result.
2767
2768 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2769 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2770
2771 SDLoc SL(Op);
2772 EVT VT = Op.getValueType();
2773 SDValue Src = Op.getOperand(0);
2774 SDNodeFlags Flags = Op->getFlags();
2775
2776 if (VT == MVT::f16) {
2777 // Nothing in half is a denormal when promoted to f32.
2778 assert(!isTypeLegal(VT));
2779 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2780 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2781 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2782 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2783 }
2784
2785 auto [ScaledInput, IsLtSmallestNormal] =
2786 getScaledLogInput(DAG, SL, Src, Flags);
2787 if (!ScaledInput)
2788 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2789
2790 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2791
2792 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2793 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2794 SDValue ResultOffset =
2795 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2796 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2797}
2798
2799static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2800 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2801 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2802 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2803}
2804
2806 SelectionDAG &DAG) const {
2807 SDValue X = Op.getOperand(0);
2808 EVT VT = Op.getValueType();
2809 SDNodeFlags Flags = Op->getFlags();
2810 SDLoc DL(Op);
2811 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2812 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2813
2814 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2815 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
2816 // depending on !fpmath metadata.
2817
2818 bool PromoteToF32 = VT == MVT::f16 && (!Flags.hasApproximateFuncs() ||
2819 !isTypeLegal(MVT::f16));
2820
2821 if (PromoteToF32) {
2822 // Log and multiply in f32 is always good enough for f16.
2823 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2824 }
2825
2826 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2827 if (PromoteToF32) {
2828 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2829 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2830 }
2831
2832 return Lowered;
2833 }
2834
2835 SDValue ScaledInput, IsScaled;
2836 if (VT == MVT::f16)
2837 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2838 else {
2839 std::tie(ScaledInput, IsScaled) = getScaledLogInput(DAG, DL, X, Flags);
2840 if (ScaledInput)
2841 X = ScaledInput;
2842 }
2843
2844 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2845
2846 SDValue R;
2847 if (Subtarget->hasFastFMAF32()) {
2848 // c+cc are ln(2)/ln(10) to more than 49 bits
2849 const float c_log10 = 0x1.344134p-2f;
2850 const float cc_log10 = 0x1.09f79ep-26f;
2851
2852 // c + cc is ln(2) to more than 49 bits
2853 const float c_log = 0x1.62e42ep-1f;
2854 const float cc_log = 0x1.efa39ep-25f;
2855
2856 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2857 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2858 // This adds correction terms for which contraction may lead to an increase
2859 // in the error of the approximation, so disable it.
2860 Flags.setAllowContract(false);
2861 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2862 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2863 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2864 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2865 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2866 } else {
2867 // ch+ct is ln(2)/ln(10) to more than 36 bits
2868 const float ch_log10 = 0x1.344000p-2f;
2869 const float ct_log10 = 0x1.3509f6p-18f;
2870
2871 // ch + ct is ln(2) to more than 36 bits
2872 const float ch_log = 0x1.62e000p-1f;
2873 const float ct_log = 0x1.0bfbe8p-15f;
2874
2875 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2876 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2877
2878 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2879 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2880 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2881 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2882 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2883 // This adds correction terms for which contraction may lead to an increase
2884 // in the error of the approximation, so disable it.
2885 Flags.setAllowContract(false);
2886 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2887 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2888 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2889 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2890 }
2891
2892 const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();
2893
2894 // TODO: Check if known finite from source value.
2895 if (!IsFiniteOnly) {
2896 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2897 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2898 }
2899
2900 if (IsScaled) {
2901 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2902 SDValue ShiftK =
2903 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2904 SDValue Shift =
2905 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2906 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2907 }
2908
2909 return R;
2910}
2911
2915
2916 // Do f32 fast math expansion for flog or flog10. This is accurate enough for a
2917 // promoted f16 operation.
2919 SelectionDAG &DAG, bool IsLog10,
2920 SDNodeFlags Flags) const {
2921 EVT VT = Src.getValueType();
2922 unsigned LogOp =
2923 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2924
2925 double Log2BaseInverted =
2927
2928 if (VT == MVT::f32) {
2929 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2930 if (ScaledInput) {
2931 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2932 SDValue ScaledResultOffset =
2933 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2934
2935 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2936
2937 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2938 ScaledResultOffset, Zero, Flags);
2939
2940 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2941
2942 if (Subtarget->hasFastFMAF32())
2943 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2944 Flags);
2945 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2946 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2947 }
2948 }
2949
2950 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2951 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2952
2953 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2954 Flags);
2955}
2956
// Expands an f64 exponential into range reduction, an FMA polynomial and a
// final ldexp. The body dispatches on Op.getOpcode(): ISD::FEXP2, ISD::FEXP10,
// and everything else is treated as ISD::FEXP.
//
// Op  - node being expanded; operand 0 is the input X.
// DAG - SelectionDAG used to create the replacement nodes.
// Returns the f64 result node Z.
//
// NOTE(review): listing line 2958 (the first line of this signature) is
// absent from this extract; only the tail of the parameter list is visible.
2957// This expansion gives a result slightly better than 1ulp.
2959 SelectionDAG &DAG) const {
2960 SDLoc DL(Op);
2961 SDValue X = Op.getOperand(0);
2962
2963 // TODO: Check if reassoc is safe. There is an output change in exp2 and
2964 // exp10, which slightly increases ulp.
2965 SDNodeFlags Flags = Op->getFlags() & ~SDNodeFlags::AllowReassociation;
2966
// DN: rounded exponent later passed to ldexp. T: reduced argument fed to the
// polynomial. F is an intermediate only assigned in the FEXP2/FEXP10 branches.
2967 SDValue DN, F, T;
2968
2969 if (Op.getOpcode() == ISD::FEXP2) {
2970 // dn = rint(x)
2971 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, X, Flags);
2972 // f = x - dn
2973 F = DAG.getNode(ISD::FSUB, DL, MVT::f64, X, DN, Flags);
2974 // t = f*C1 + f*C2
// C1/C2 are presumably a high/low split of ln(2) -- TODO confirm.
2975 SDValue C1 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
2976 SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
2977 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C2, Flags);
2978 T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C1, Mul2, Flags);
2979 } else if (Op.getOpcode() == ISD::FEXP10) {
2980 // dn = rint(x * C1)
2981 SDValue C1 = DAG.getConstantFP(0x1.a934f0979a371p+1, DL, MVT::f64);
2982 SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
2983 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);
2984
2985 // f = FMA(-dn, C2, FMA(-dn, C3, x))
2986 SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
2987 SDValue C2 = DAG.getConstantFP(-0x1.9dc1da994fd21p-59, DL, MVT::f64);
2988 SDValue C3 = DAG.getConstantFP(0x1.34413509f79ffp-2, DL, MVT::f64);
2989 SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
2990 F = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
2991
2992 // t = FMA(f, C4, f*C5)
2993 SDValue C4 = DAG.getConstantFP(0x1.26bb1bbb55516p+1, DL, MVT::f64);
2994 SDValue C5 = DAG.getConstantFP(-0x1.f48ad494ea3e9p-53, DL, MVT::f64);
2995 SDValue MulF = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C5, Flags);
2996 T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C4, MulF, Flags);
2997 } else { // ISD::FEXP
2998 // dn = rint(x * C1)
2999 SDValue C1 = DAG.getConstantFP(0x1.71547652b82fep+0, DL, MVT::f64);
3000 SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
3001 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);
3002
3003 // t = FMA(-dn, C2, FMA(-dn, C3, x))
3004 SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
3005 SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
3006 SDValue C3 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
3007 SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
3008 T = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
3009 }
3010
3011 // Polynomial expansion for p
// Horner form: every step below computes P = T * P + <next coefficient>.
3012 SDValue P = DAG.getConstantFP(0x1.ade156a5dcb37p-26, DL, MVT::f64);
3013 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3014 DAG.getConstantFP(0x1.28af3fca7ab0cp-22, DL, MVT::f64),
3015 Flags);
3016 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3017 DAG.getConstantFP(0x1.71dee623fde64p-19, DL, MVT::f64),
3018 Flags);
3019 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3020 DAG.getConstantFP(0x1.a01997c89e6b0p-16, DL, MVT::f64),
3021 Flags);
3022 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3023 DAG.getConstantFP(0x1.a01a014761f6ep-13, DL, MVT::f64),
3024 Flags);
3025 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3026 DAG.getConstantFP(0x1.6c16c1852b7b0p-10, DL, MVT::f64),
3027 Flags);
3028 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3029 DAG.getConstantFP(0x1.1111111122322p-7, DL, MVT::f64), Flags);
3030 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3031 DAG.getConstantFP(0x1.55555555502a1p-5, DL, MVT::f64), Flags);
3032 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3033 DAG.getConstantFP(0x1.5555555555511p-3, DL, MVT::f64), Flags);
3034 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3035 DAG.getConstantFP(0x1.000000000000bp-1, DL, MVT::f64), Flags);
3036
3037 SDValue One = DAG.getConstantFP(1.0, DL, MVT::f64);
3038
// The same statement appears twice: these are the last two Horner steps, both
// with coefficient 1.0, giving 1 + T * (1 + T * P).
3039 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
3040 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
3041
3042 // z = ldexp(p, (int)dn)
3043 SDValue DNInt = DAG.getNode(ISD::FP_TO_SINT, DL, MVT::i32, DN);
3044 SDValue Z = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, P, DNInt, Flags);
3045
3046 // Overflow/underflow guards
// Both comparisons use unordered predicates (SETULE / SETUGE), so a NaN input
// satisfies them and keeps the computed Z instead of the clamp value.
3047 SDValue CondHi = DAG.getSetCC(
3048 DL, MVT::i1, X, DAG.getConstantFP(1024.0, DL, MVT::f64), ISD::SETULE);
3049
// The +inf clamp is only emitted when the no-infs flag is not set.
3050 if (!Flags.hasNoInfs()) {
3051 SDValue PInf = DAG.getConstantFP(std::numeric_limits<double>::infinity(),
3052 DL, MVT::f64);
3053 Z = DAG.getSelect(DL, MVT::f64, CondHi, Z, PInf, Flags);
3054 }
3055
// Inputs below -1075 select +0.0 instead of the computed value.
3056 SDValue CondLo = DAG.getSetCC(
3057 DL, MVT::i1, X, DAG.getConstantFP(-1075.0, DL, MVT::f64), ISD::SETUGE);
3058 SDValue Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
3059 Z = DAG.getSelect(DL, MVT::f64, CondLo, Z, Zero, Flags);
3060
3061 return Z;
3062}
3063
// Body of what appears to be the exp2 lowering: the input is passed to
// AMDGPUISD::EXP without any base-conversion multiply.
// NOTE(review): the signature (listing line 3064) is missing from this
// extract, so the function name and parameter list are not visible here.
//
// Dispatch, by result type:
//   f64 -> delegated to lowerFEXPF64.
//   f16 -> extend to f32, AMDGPUISD::EXP, round back to f16.
//   f32 -> AMDGPUISD::EXP directly, or the scaled sequence below when
//          needsDenormHandlingF32 reports that denormals must be handled.
3065 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3066 // If we have to handle denormals, scale up the input and adjust the result.
3067
3068 EVT VT = Op.getValueType();
3069 if (VT == MVT::f64)
3070 return lowerFEXPF64(Op, DAG);
3071
3072 SDLoc SL(Op);
3073 SDValue Src = Op.getOperand(0);
3074 SDNodeFlags Flags = Op->getFlags();
3075
3076 if (VT == MVT::f16) {
3077 // Nothing in half is a denormal when promoted to f32.
3078 assert(!isTypeLegal(MVT::f16));
3079 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
// Despite its name, 'Log' holds the result of the EXP node.
3080 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
3081 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
3082 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3083 }
3084
3085 assert(VT == MVT::f32);
3086
3087 if (!needsDenormHandlingF32(DAG, Src, Flags))
3088 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
3089
3090 // bool needs_scaling = x < -0x1.f80000p+6f;
3091 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3092
3093 // -nextafter(128.0, -1)
// NOTE(review): -0x1.f80000p+6f evaluates to -126.0f, which does not match
// the description above; 2^-126 is the smallest normal f32 -- confirm intent.
3094 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
3095
3096 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3097
3098 SDValue NeedsScaling =
3099 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
3100
3101 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
3102 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3103
// Add 64 to the input when scaling is needed, otherwise add 0.
3104 SDValue AddOffset =
3105 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
3106
3107 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
3108 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
3109
// Undo the +64 input offset by multiplying the result by 2^-64 (or by 1.0).
3110 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
3111 SDValue One = DAG.getConstantFP(1.0, SL, VT);
3112 SDValue ResultScale =
3113 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
3114
3115 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
3116}
3117
// Fast exp/exp10 with no denormal handling: multiplies X by a base-conversion
// constant and takes exp2 of the product.
//
// DAG     - SelectionDAG used to create nodes.
// Flags   - node flags applied to the FMUL and to the exp2 node.
// IsExp10 - selects the log2(10) constant instead of log2(e).
// Returns the exp2 node: AMDGPUISD::EXP for f32, ISD::FEXP2 for other types.
//
// NOTE(review): listing line 3118 (start of the signature, declaring the X and
// SL used below) is missing from this extract.
3119 SelectionDAG &DAG,
3120 SDNodeFlags Flags,
3121 bool IsExp10) const {
3122 // exp(x) -> exp2(M_LOG2E_F * x);
3123 // exp10(x) -> exp2(log2(10) * x);
3124 EVT VT = X.getValueType();
3125 SDValue Const =
3126 DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
3127
3128 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
// The casts give both arms of the conditional the same type (unsigned).
3129 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
3130 : (unsigned)ISD::FEXP2,
3131 SL, VT, Mul, Flags);
3132}
3133
// Approximate exp(X). For non-f32 types, or when needsDenormHandlingF32 says
// no handling is required, this defers to lowerFEXPUnsafeImpl. Otherwise small
// inputs are offset by +64 before the exp2 and the result is rescaled.
//
// DAG   - SelectionDAG used to create nodes.
// Flags - node flags applied to the arithmetic nodes and the final select.
//
// NOTE(review): listing line 3134 (start of the signature, declaring the X and
// SL used below) is missing from this extract.
3135 SelectionDAG &DAG,
3136 SDNodeFlags Flags) const {
3137 EVT VT = X.getValueType();
3138 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
3139 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
3140
3141 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3142
// Inputs below this threshold take the scaled path.
3143 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
3144 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3145
3146 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
3147
3148 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3149
3150 SDValue AdjustedX =
3151 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3152
3153 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
3154 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
3155
3156 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
3157
// Compensates for the +64 offset; presumably this constant is e^-64 -- TODO
// confirm.
3158 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
3159 SDValue AdjustedResult =
3160 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
3161
3162 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3163 Flags);
3164}
3165
3166/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3167/// handled correctly.
///
/// The result is formed as the product of two exp2 nodes, one per constant
/// K0 and K1. For f32 inputs that need denormal handling, inputs below a
/// threshold are first offset by +32 and the product is rescaled afterwards.
///
/// \param DAG SelectionDAG used to create nodes.
/// \param Flags node flags applied to the generated nodes.
///
/// NOTE(review): listing line 3168 (start of the signature, declaring the X
/// and SL used below) is missing from this extract.
3169 SelectionDAG &DAG,
3170 SDNodeFlags Flags) const {
3171 const EVT VT = X.getValueType();
3172
// f32 uses the target EXP node; other types use the generic FEXP2.
3173 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3174 : static_cast<unsigned>(ISD::FEXP2);
3175
3176 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3177 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3178 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3179 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3180
3181 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3182 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3183 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3184 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
// NOTE(review): unlike the equivalent FMUL in the scaled path below, this one
// is created without Flags -- confirm whether that is intentional.
3185 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3186 }
3187
3188 // bool s = x < -0x1.2f7030p+5f;
3189 // x += s ? 0x1.0p+5f : 0.0f;
3190 // exp10 = exp2(x * 0x1.a92000p+1f) *
3191 // exp2(x * 0x1.4f0978p-11f) *
3192 // (s ? 0x1.9f623ep-107f : 1.0f);
3193
3194 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3195
3196 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3197 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3198
3199 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3200 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3201 SDValue AdjustedX =
3202 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3203
3204 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3205 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3206
3207 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3208 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3209 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3210 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3211
3212 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3213
// Unlike the pseudo code above, the rescale is applied through a select on the
// final value rather than by multiplying by a selected factor.
3214 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3215 SDValue AdjustedResult =
3216 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3217
3218 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3219 Flags);
3220}
3221
// Body of what appears to be the shared exp / exp10 lowering (IsExp10 is
// derived from the opcode below).
// NOTE(review): the signature (listing line 3222) is missing from this
// extract, so the function name and parameter list are not visible here.
//
// Dispatch:
//   f64                  -> lowerFEXPF64.
//   approx-func allowed  -> lowerFEXP10Unsafe / lowerFEXPUnsafe.
//   scalar f16           -> extend to f32, lowerFEXPUnsafeImpl, round back.
//   vector of f16        -> returns an empty SDValue.
//   f32                  -> extended-precision product followed by exp2 and
//                           ldexp, with explicit underflow/overflow selects.
3223 EVT VT = Op.getValueType();
3224
3225 if (VT == MVT::f64)
3226 return lowerFEXPF64(Op, DAG);
3227
3228 SDLoc SL(Op);
3229 SDValue X = Op.getOperand(0);
3230 SDNodeFlags Flags = Op->getFlags();
3231 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3232
3233 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3234 // library behavior. Also, is known-not-daz source sufficient?
3235 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3236 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3237 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3238 }
3239
3240 if (VT.getScalarType() == MVT::f16) {
3241 if (VT.isVector())
3242 return SDValue();
3243
3244 // Nothing in half is a denormal when promoted to f32.
3245 //
3246 // exp(f16 x) ->
3247 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3248 //
3249 // exp10(f16 x) ->
3250 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3251 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3252 SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
3253 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3254 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3255 }
3256
3257 assert(VT == MVT::f32);
3258
3259 // Algorithm:
3260 //
3261 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3262 //
3263 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3264 // n = 64*m + j, 0 <= j < 64
3265 //
3266 // e^x = 2^((64*m + j + f)/64)
3267 // = (2^m) * (2^(j/64)) * 2^(f/64)
3268 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3269 //
3270 // f = x*(64/ln(2)) - n
3271 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3272 //
3273 // e^x = (2^m) * (2^(j/64)) * e^r
3274 //
3275 // (2^(j/64)) is precomputed
3276 //
3277 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3278 // e^r = 1 + q
3279 //
3280 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3281 //
3282 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
// NOTE(review): the code below does not use a 2^(j/64) table or a polynomial.
// It computes the product x*C as a high part PH plus a correction PL, then
// evaluates ldexp(exp2((PH - E) + PL), E) with E = roundeven(PH).
3283 SDNodeFlags FlagsNoContract = Flags;
3284 FlagsNoContract.setAllowContract(false);
3285
3286 SDValue PH, PL;
3287 if (Subtarget->hasFastFMAF32()) {
3288 const float c_exp = numbers::log2ef;
3289 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3290 const float c_exp10 = 0x1.a934f0p+1f;
3291 const float cc_exp10 = 0x1.2f346ep-24f;
3292
3293 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3294 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3295
// PH = rounded x*C. FMA0 = fma(x, C, -PH) recovers the rounding error of that
// product, and PL adds the contribution of the low constant part CC.
3296 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3297 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3298 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3299 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3300 } else {
3301 const float ch_exp = 0x1.714000p+0f;
3302 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3303
3304 const float ch_exp10 = 0x1.a92000p+1f;
3305 const float cl_exp10 = 0x1.4f0978p-11f;
3306
3307 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3308 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3309
// Split X into XH (low 12 bits of the bit pattern cleared) and XL = X - XH.
3310 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3311 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3312 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3313 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3314 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3315
3316 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3317
// PL = XH*CL + (XL*CH + XL*CL): the remaining cross terms of (XH+XL)*(CH+CL).
3318 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3319 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3320 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3321 }
3322
3323 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3324
3325 // It is unsafe to contract this fsub into the PH multiply.
3326 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3327
3328 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3329 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3330 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3331
3332 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3333
// Inputs below this constant (different for exp and exp10) select 0.0.
3334 SDValue UnderflowCheckConst =
3335 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3336
3337 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3338 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3339 SDValue Underflow =
3340 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3341
3342 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3343
// The overflow select is only emitted when the no-infs flag is not set.
3344 if (!Flags.hasNoInfs()) {
3345 SDValue OverflowCheckConst =
3346 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3347 SDValue Overflow =
3348 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
// NOTE(review): listing line 3350 (the initializer of Inf) is missing from
// this extract; the declaration below is incomplete as shown.
3349 SDValue Inf =
3351 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3352 }
3353
3354 return R;
3355}
3356
// Returns true when Opc is one of the two count-leading-zeros opcodes,
// ISD::CTLZ or ISD::CTLZ_ZERO_POISON.
3357static bool isCtlzOpc(unsigned Opc) {
3358 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_POISON;
3359}
3360
// Returns true when Opc is one of the two count-trailing-zeros opcodes,
// ISD::CTTZ or ISD::CTTZ_ZERO_POISON.
3361static bool isCttzOpc(unsigned Opc) {
3362 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_POISON;
3363}
3364
// Widens an i8 or i16 bit-count operation to i32 and truncates the result
// back. Any other result type returns an empty SDValue.
//
// DAG - SelectionDAG used to create nodes.
//
// NOTE(review): listing line 3365 (start of the signature, declaring Op) and
// listing line 3375 are missing from this extract, so the function name is
// not visible here.
3366 SelectionDAG &DAG) const {
3367 auto SL = SDLoc(Op);
3368 auto Opc = Op.getOpcode();
3369 auto Arg = Op.getOperand(0u);
3370 auto ResultVT = Op.getValueType();
3371
3372 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3373 return {};
3374
3376 assert(ResultVT == Arg.getValueType());
3377
// NumExtBits is the number of extra high bits introduced by widening to i32.
3378 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3379 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3380 SDValue NewOp;
3381
3382 if (Opc == ISD::CTLZ_ZERO_POISON) {
// Shift the value into the top of the i32 so the widened count matches the
// narrow one; the any-extended high bits are shifted out.
3383 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3384 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3385 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3386 } else {
// Zero-extend, count in i32, then subtract the extra leading zero bits.
3387 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3388 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3389 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3390 }
3391
3392 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3393}
3394
// Body of the lowering for CTLZ / CTTZ and their _ZERO_POISON variants onto
// AMDGPUISD::FFBH_U32 / AMDGPUISD::FFBL_B32.
// NOTE(review): the signature (listing line 3395) is missing from this
// extract, so the function name and parameter list are not visible here.
//
// i32 sources and non-divergent i64 sources use a single node; other sources
// are split into two i32 halves and the two counts are combined.
3396 SDLoc SL(Op);
3397 SDValue Src = Op.getOperand(0);
3398
3399 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3400 bool Ctlz = isCtlzOpc(Op.getOpcode());
3401 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3402
3403 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_POISON ||
3404 Op.getOpcode() == ISD::CTTZ_ZERO_POISON;
3405 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3406
3407 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3408 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3409 // (cttz hi:lo) -> (umin (ffbl src), 32)
3410 // (ctlz_zero_poison src) -> (ffbh src)
3411 // (cttz_zero_poison src) -> (ffbl src)
3412
3413 // The 64-bit scalar version produces a 32-bit result
3414 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3415 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3416 // (ctlz_zero_poison src) -> (S_FLBIT_I32_B64 src)
3417 // (cttz_zero_poison src) -> (S_FF1_I32_B64 src)
3418 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3419 if (!ZeroUndef) {
// Clamp to the bit width of the result type (32 or 64) for the zero-input case.
3420 const SDValue ConstVal = DAG.getConstant(
3421 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3422 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3423 }
3424 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3425 }
3426
3427 SDValue Lo, Hi;
3428 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3429
3430 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3431 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3432
3433 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3434 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3435 // (ctlz_zero_poison hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3436 // (cttz_zero_poison hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3437
// A plain ADD is used when zero input is poison; otherwise a saturating add.
3438 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3439 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
// The half that is examined second gets the +32 bias: Lo for ctlz, Hi for cttz.
3440 if (Ctlz)
3441 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3442 else
3443 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3444
3445 SDValue NewOpr;
3446 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3447 if (!ZeroUndef) {
3448 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3449 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3450 }
3451
3452 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3453}
3454
// Body of LowerCTLS (name taken from the assert message below): builds
// umin(amdgcn_sffbh(Src), 32) - 1 for an i32 source.
// NOTE(review): the signature (listing line 3455) is missing from this
// extract.
3456 SDLoc SL(Op);
3457 SDValue Src = Op.getOperand(0);
3458 assert(Src.getValueType() == MVT::i32 && "LowerCTLS only supports i32");
3459 SDValue Ffbh = DAG.getNode(
3460 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
3461 DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Src);
// Unsigned clamp to 32; presumably this maps an all-ones "not found" result
// of sffbh to 32 -- TODO confirm against the intrinsic's definition.
3462 SDValue Clamped = DAG.getNode(ISD::UMIN, SL, MVT::i32, Ffbh,
3463 DAG.getConstant(32, SL, MVT::i32));
// Adding the all-ones constant subtracts one from the clamped count.
3464 return DAG.getNode(ISD::ADD, SL, MVT::i32, Clamped,
3465 DAG.getAllOnesConstant(SL, MVT::i32));
3466}
3467
// Converts an integer to a 16-bit float type in two steps: the original
// conversion opcode is re-emitted with an f32 result, then FP_ROUND narrows
// that to FP16Ty.
//
// FP16Ty - destination type; asserted to be MVT::f16 or MVT::bf16.
//
// NOTE(review): listing line 3468 (start of the signature, declaring Op and
// DAG) is missing from this extract.
3469 EVT FP16Ty) const {
3470 assert(FP16Ty == MVT::f16 || FP16Ty == MVT::bf16);
3471 SDLoc SL(Op);
3472 SDValue Src = Op.getOperand(0);
3473 SDValue ToF32 = DAG.getNode(Op.getOpcode(), SL, MVT::f32, Src);
3474 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3475 return DAG.getNode(ISD::FP_ROUND, SL, FP16Ty, ToF32, FPRoundFlag);
3476}
3477
// Converts a 64-bit integer to f32 by normalizing it into 32 bits, using the
// 32-bit conversion, and scaling the result back.
//
// Signed - true for a signed source, false for unsigned.
// Returns the f32 result node.
//
// NOTE(review): listing line 3478 (start of the signature, declaring Op and
// DAG) is missing from this extract.
3479 bool Signed) const {
3480 // The regular method converting a 64-bit integer to float roughly consists of
3481 // 2 steps: normalization and rounding. In fact, after normalization, the
3482 // conversion from a 64-bit integer to a float is essentially the same as the
3483 // one from a 32-bit integer. The only difference is that it has more
3484 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3485 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3486 // converted into the correct float number. The basic steps for the unsigned
3487 // conversion are illustrated in the following pseudo code:
3488 //
3489 // f32 uitofp(i64 u) {
3490 // i32 hi, lo = split(u);
3491 // // Only count the leading zeros in hi as we have native support of the
3492 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3493 // // reduced to a 32-bit one automatically.
3494 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3495 // u <<= shamt;
3496 // hi, lo = split(u);
3497 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3498 // // convert it as a 32-bit integer and scale the result back.
3499 // return uitofp(hi) * 2^(32 - shamt);
3500 // }
3501 //
3502 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3503 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3504 // converted instead followed by negation based on its sign bit.
3505
3506 SDLoc SL(Op);
3507 SDValue Src = Op.getOperand(0);
3508
3509 SDValue Lo, Hi;
3510 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
// Sign is only assigned in the signed path that does not use sffbh.
3511 SDValue Sign;
3512 SDValue ShAmt;
3513 if (Signed && Subtarget->isGCN()) {
3514 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3515 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3516 // account. That is, the maximal shift is
3517 // - 32 if Lo and Hi have opposite signs;
3518 // - 33 if Lo and Hi have the same sign.
3519 //
3520 // Or, MaxShAmt = 33 + OppositeSign, where
3521 //
3522 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3523 // - -1 if Lo and Hi have opposite signs; and
3524 // - 0 otherwise.
3525 //
3526 // All in all, ShAmt is calculated as
3527 //
3528 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3529 //
3530 // or
3531 //
3532 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3533 //
3534 // to reduce the critical path.
3535 SDValue OppositeSign = DAG.getNode(
3536 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3537 DAG.getConstant(31, SL, MVT::i32));
3538 SDValue MaxShAmt =
3539 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3540 OppositeSign);
3541 // Count the leading sign bits.
3542 ShAmt = DAG.getNode(
3543 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
3544 DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Hi);
3545 // Different from unsigned conversion, the shift should be one bit less to
3546 // preserve the sign bit.
3547 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3548 DAG.getConstant(1, SL, MVT::i32));
3549 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3550 } else {
3551 if (Signed) {
3552 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3553 // absolute value first.
// Sign is all ones for a negative Src and zero otherwise, so
// (Src + Sign) ^ Sign yields the absolute value.
3554 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3555 DAG.getConstant(63, SL, MVT::i64));
3556 SDValue Abs =
3557 DAG.getNode(ISD::XOR, SL, MVT::i64,
3558 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3559 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3560 }
3561 // Count the leading zeros.
3562 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3563 // The shift amount for signed integers is [0, 32].
3564 }
3565 // Normalize the given 64-bit integer.
// NOTE(review): in the signed path that computes Abs above, Abs is used only
// to derive ShAmt; the shift below is applied to Src, not Abs. For a negative
// Src this looks inconsistent with the "absolute value is converted" comment
// -- verify.
3566 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3567 // Split it again.
3568 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3569 // Calculate the adjust bit for rounding.
3570 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3571 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3572 DAG.getConstant(1, SL, MVT::i32), Lo);
3573 // Get the 32-bit normalized integer.
3574 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3575 // Convert the normalized 32-bit integer into f32.
3576
// The signed conversion is only used together with FLDEXP; otherwise the value
// is converted as unsigned and the sign bit is set manually at the end.
3577 bool UseLDEXP = isOperationLegal(ISD::FLDEXP, MVT::f32);
3578 unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3579 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3580
3581 // Finally, need to scale back the converted floating number as the original
3582 // 64-bit integer is converted as a 32-bit one.
3583 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3584 ShAmt);
3585 // On GCN, use LDEXP directly.
3586 if (UseLDEXP)
3587 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3588
3589 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3590 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3591 // exponent is enough to avoid overflowing into the sign bit.
3592 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3593 DAG.getConstant(23, SL, MVT::i32));
3594 SDValue IVal =
3595 DAG.getNode(ISD::ADD, SL, MVT::i32,
3596 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3597 if (Signed) {
3598 // Set the sign bit.
3599 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3600 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3601 DAG.getConstant(31, SL, MVT::i32));
3602 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3603 }
3604 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3605}
3606
// Converts a 64-bit integer to f64 as ldexp(cvt(Hi), 32) + uitofp(Lo), where
// Lo and Hi are the two 32-bit halves of the source.
//
// Signed - true for a signed source, false for unsigned.
//
// NOTE(review): listing line 3607 (start of the signature, declaring Op and
// DAG) and listing line 3615 (the start of the CvtHi definition, presumably
// where Signed selects the conversion opcode) are missing from this extract.
3608 bool Signed) const {
3609 SDLoc SL(Op);
3610 SDValue Src = Op.getOperand(0);
3611
3612 SDValue Lo, Hi;
3613 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3614
3616 SL, MVT::f64, Hi);
3617
// The low half is always converted as unsigned.
3618 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3619
// Scale the converted high half by 2^32.
3620 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3621 DAG.getConstant(32, SL, MVT::i32));
3622 // TODO: Should this propagate fast-math-flags?
3623 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3624}
3625
// Custom lowering for an unsigned integer to floating-point conversion.
//
// DAG - SelectionDAG used to create nodes.
// Returns Op unchanged when no custom lowering applies (i16 -> f16, or a
// non-i64 source with an f32/f64 destination); otherwise a replacement node.
//
// NOTE(review): listing line 3626 (start of the signature, declaring Op) is
// missing from this extract; the name is inferred from the TODO below.
3627 SelectionDAG &DAG) const {
3628 // TODO: Factor out code common with LowerSINT_TO_FP.
3629 EVT DestVT = Op.getValueType();
3630 SDValue Src = Op.getOperand(0);
3631 EVT SrcVT = Src.getValueType();
3632
3633 if (SrcVT == MVT::i16) {
3634 if (DestVT == MVT::f16)
3635 return Op;
3636 SDLoc DL(Op);
3637
3638 // Promote src to i32
3639 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3640 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3641 }
3642
// 16-bit float destinations go through an f32 intermediate.
3643 if (DestVT == MVT::bf16 || DestVT == MVT::f16)
3644 return LowerINT_TO_FP16(Op, DAG, DestVT);
3645
3646 if (SrcVT != MVT::i64)
3647 return Op;
3648
3649 if (DestVT == MVT::f32)
3650 return LowerINT_TO_FP32(Op, DAG, false);
3651
3652 assert(DestVT == MVT::f64);
3653 return LowerINT_TO_FP64(Op, DAG, false);
3654}
3655
// Custom lowering for a signed integer to floating-point conversion.
//
// DAG - SelectionDAG used to create nodes.
// Returns Op unchanged when no custom lowering applies (i16 -> f16, or a
// non-i64 source with an f32/f64 destination); otherwise a replacement node.
//
// NOTE(review): listing line 3656 (start of the signature, declaring Op) is
// missing from this extract; the name is inferred from the TODO below.
3657 SelectionDAG &DAG) const {
3658 EVT DestVT = Op.getValueType();
3659
3660 SDValue Src = Op.getOperand(0);
3661 EVT SrcVT = Src.getValueType();
3662
3663 if (SrcVT == MVT::i16) {
3664 if (DestVT == MVT::f16)
3665 return Op;
3666
3667 SDLoc DL(Op);
3668 // Promote src to i32
3669 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3670 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3671 }
3672
// 16-bit float destinations go through an f32 intermediate.
3673 if (DestVT == MVT::bf16 || DestVT == MVT::f16)
3674 return LowerINT_TO_FP16(Op, DAG, DestVT);
3675
3676 if (SrcVT != MVT::i64)
3677 return Op;
3678
3679 // TODO: Factor out code common with LowerUINT_TO_FP.
3680
3681 if (DestVT == MVT::f32)
3682 return LowerINT_TO_FP32(Op, DAG, true);
3683
3684 assert(DestVT == MVT::f64);
3685 return LowerINT_TO_FP64(Op, DAG, true);
3686}
3687
// Converts an f32 or f64 value to a 64-bit integer by producing the high and
// low 32-bit halves separately and bitcasting the {Lo, Hi} pair to i64.
//
// Signed - true for a signed result, false for unsigned.
// Returns the i64 result node.
//
// NOTE(review): listing line 3688 (start of the signature, declaring Op and
// DAG) is missing from this extract, so the function name is not visible.
3689 bool Signed) const {
3690 SDLoc SL(Op);
3691
3692 SDValue Src = Op.getOperand(0);
3693 EVT SrcVT = Src.getValueType();
3694
3695 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3696
3697 // The basic idea of converting a floating point number into a pair of 32-bit
3698 // integers is illustrated as follows:
3699 //
3700 // tf := trunc(val);
3701 // hif := floor(tf * 2^-32);
3702 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3703 // hi := fptoi(hif);
3704 // lo := fptoi(lof);
3705 //
3706 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3707 SDValue Sign;
3708 if (Signed && SrcVT == MVT::f32) {
3709 // However, a 32-bit floating point number has only 23 bits mantissa and
3710 // it's not enough to hold all the significant bits of `lof` if val is
3711 // negative. To avoid the loss of precision, we need to take the absolute
3712 // value after truncating and flip the result back based on the original
3713 // signedness.
// Arithmetic shift of the bit pattern by 31 yields all ones for a negative
// value and zero otherwise.
3714 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3715 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3716 DAG.getConstant(31, SL, MVT::i32));
3717 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3718 }
3719
// K0 = 2^-32 and K1 = -2^32, materialized from their bit patterns in SrcVT.
3720 SDValue K0, K1;
3721 if (SrcVT == MVT::f64) {
3722 K0 = DAG.getConstantFP(
3723 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3724 SrcVT);
3725 K1 = DAG.getConstantFP(
3726 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3727 SrcVT);
3728 } else {
3729 K0 = DAG.getConstantFP(
3730 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3731 K1 = DAG.getConstantFP(
3732 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3733 }
3734 // TODO: Should this propagate fast-math-flags?
3735 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3736
3737 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3738
// Fma = Trunc - FloorMul * 2^32, i.e. the low 32 bits as a float.
3739 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3740
// NOTE(review): listing line 3742 (the other arm of this conditional opcode)
// is missing from this extract.
3741 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3743 SL, MVT::i32, FloorMul);
3744 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3745
3746 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3747 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3748
3749 if (Signed && SrcVT == MVT::f32) {
3750 assert(Sign);
3751 // Flip the result based on the signedness, which is either all 0s or 1s.
3752 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3753 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3754 // r := xor(r, sign) - sign;
3755 Result =
3756 DAG.getNode(ISD::SUB, SL, MVT::i64,
3757 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3758 }
3759
3760 return Result;
3761}
3762
// Body of the FP_TO_FP16 lowering (name inferred from the comment below).
// NOTE(review): the signature (listing line 3763) is missing from this
// extract.
//
// f32 sources become AMDGPUISD::FP_TO_FP16. Other sources return an empty
// SDValue when the approximate-functions flag is set, and otherwise go
// through LowerF64ToF16Safe.
3764 SDLoc DL(Op);
3765 SDValue N0 = Op.getOperand(0);
3766
3767 // Convert to target node to get known bits
3768 if (N0.getValueType() == MVT::f32)
3769 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3770
3771 if (Op->getFlags().hasApproximateFuncs()) {
3772 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3773 return SDValue();
3774 }
3775
3776 return LowerF64ToF16Safe(N0, DL, DAG);
3777}
3778
// Converts an f64 value to f16 bits using integer operations only, with
// round-to-nearest-even rounding.
//
// DAG - SelectionDAG used to create nodes.
// Returns an i32 node: sign bit (0x8000) OR'ed with the value V built below.
//
// NOTE(review): listing line 3780 (start of the signature, declaring Src and
// DL) is missing from this extract.
3779// return node in i32
3781 SelectionDAG &DAG) const {
3782 assert(Src.getSimpleValueType() == MVT::f64);
3783
3784 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3785 // TODO: We can generate better code for True16.
3786 const unsigned ExpMask = 0x7ff;
3787 const unsigned ExpBiasf64 = 1023;
3788 const unsigned ExpBiasf16 = 15;
3789 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3790 SDValue One = DAG.getConstant(1, DL, MVT::i32);
// UH = upper 32 bits of the f64 bit pattern, U = lower 32 bits.
3791 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3792 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3793 DAG.getConstant(32, DL, MVT::i64));
3794 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3795 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
// NOTE(review): the shift-amount constant below is typed i64 while the value
// being shifted is i32 and every other shift here uses an i32 amount --
// confirm whether this is intentional.
3796 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3797 DAG.getConstant(20, DL, MVT::i64));
3798 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3799 DAG.getConstant(ExpMask, DL, MVT::i32));
3800 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3801 // add the f16 bias (15) to get the biased exponent for the f16 format.
// Both biases are declared unsigned, so the negation is computed modulo 2^32.
3802 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3803 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3804
// M takes mantissa bits from UH, shifted down by 8 and masked with 0xffe so
// that bit 0 is left free for the sticky bit.
3805 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3806 DAG.getConstant(8, DL, MVT::i32));
3807 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3808 DAG.getConstant(0xffe, DL, MVT::i32));
3809
// Collect every mantissa bit not kept in M: the low 9 bits of UH and all of U.
3810 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3811 DAG.getConstant(0x1ff, DL, MVT::i32));
3812 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3813
// Sticky bit: 0 when all of those discarded bits are zero, otherwise 1.
3814 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3815 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3816
3817 // (M != 0 ? 0x0200 : 0) | 0x7c00;
// I is the value selected further below when E == 1039.
3818 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3819 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3820 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3821
3822 // N = M | (E << 12);
3823 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3824 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3825 DAG.getConstant(12, DL, MVT::i32)));
3826
3827 // B = clamp(1-E, 0, 13);
3828 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3829 One, E);
3830 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3831 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3832 DAG.getConstant(13, DL, MVT::i32));
3833
// D is the alternative to N used when E < 1: M with bit 0x1000 set, shifted
// right by B.
3834 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3835 DAG.getConstant(0x1000, DL, MVT::i32));
3836
3837 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
// Shift back and compare to detect bits lost by the right shift; if any were
// lost, OR a 1 into D as a sticky bit.
3838 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3839 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3840 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3841
// V = D when E < 1, otherwise N.
3842 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
// Round using the low three bits of V: after dropping two bits, add 1 when
// those three bits equal 3 or are greater than 5.
3843 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3844 DAG.getConstant(0x7, DL, MVT::i32));
3845 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3846 DAG.getConstant(2, DL, MVT::i32));
3847 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3848 One, Zero, ISD::SETEQ);
3849 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3850 One, Zero, ISD::SETGT);
3851 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3852 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3853
// E > 30 selects 0x7c00.
3854 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3855 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
// 1039 == 0x7ff - 1023 + 15, i.e. the rebased value of an all-ones f64
// exponent field; that case selects I.
3856 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3857 I, V, ISD::SETEQ);
3858
3859 // Extract the sign bit.
3860 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3861 DAG.getConstant(16, DL, MVT::i32));
3862 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3863 DAG.getConstant(0x8000, DL, MVT::i32));
3864
3865 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3866}
3867
3869 SelectionDAG &DAG) const {
3870 SDValue Src = Op.getOperand(0);
3871 unsigned OpOpcode = Op.getOpcode();
3872 EVT SrcVT = Src.getValueType();
3873 EVT DestVT = Op.getValueType();
3874
3875 // Will be selected natively
3876 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3877 return Op;
3878
3879 if (SrcVT == MVT::bf16 || (SrcVT == MVT::f16 && DestVT == MVT::i32)) {
3880 SDLoc DL(Op);
3881 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3882 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3883 }
3884
3885 // Promote i16 to i32
3886 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3887 SDLoc DL(Op);
3888
3889 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3890 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3891 }
3892
3893 if (DestVT != MVT::i64)
3894 return Op;
3895
3896 if (SrcVT == MVT::f16 ||
3897 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3898 SDLoc DL(Op);
3899
3900 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3901 unsigned Ext =
3903 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3904 }
3905
3906 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3907 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3908
3909 return SDValue();
3910}
3911
3913 SelectionDAG &DAG) const {
3914 SDValue Src = Op.getOperand(0);
3915 unsigned OpOpcode = Op.getOpcode();
3916 EVT SrcVT = Src.getValueType();
3917 EVT DstVT = Op.getValueType();
3918 SDValue SatVTOp = Op.getNode()->getOperand(1);
3919 EVT SatVT = cast<VTSDNode>(SatVTOp)->getVT();
3920 SDLoc DL(Op);
3921
3922 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3923 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3924 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3925
3926 // Scalar cases will be selected natively to v_cvt_/s_cvt_ instructions.
3927 // v2f32 -> v2i16 will be selected natively to v_cvt_pk_[iu]16_f32.
3928 if (SatWidth == DstWidth) {
3929 if ((DstVT == MVT::i32 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
3930 (DstVT == MVT::i16 && (SrcVT == MVT::f16 || SrcVT == MVT::f32)) ||
3931 (DstVT == MVT::v2i16 && SrcVT == MVT::v2f32))
3932 return Op;
3933 }
3934
3935 // Vectors can only be selected natively.
3936 if (DstVT.isVector())
3937 return SDValue();
3938
3939 // Perform all saturation at selected width (i16 or i32) and truncate
3940 if (SatWidth < DstWidth && SatWidth <= 32) {
3941 // For f16 conversion with sub-i16 saturation perform saturation
3942 // at i16, if available in the target. This removes the need for extra f16
3943 // to f32 conversion. For all the others use i32.
3944 MVT ResultVT =
3945 Subtarget->has16BitInsts() && SrcVT == MVT::f16 && SatWidth < 16
3946 ? MVT::i16
3947 : MVT::i32;
3948
3949 const SDValue ResultVTOp = DAG.getValueType(ResultVT);
3950 const uint64_t ResultWidth = ResultVT.getScalarSizeInBits();
3951
3952 // First, convert input float into selected integer (i16 or i32)
3953 SDValue FpToInt = DAG.getNode(OpOpcode, DL, ResultVT, Src, ResultVTOp);
3954 SDValue IntSatVal;
3955
3956 // Then, clamp at the saturation width using either i16 or i32 instructions
3957 if (OpOpcode == ISD::FP_TO_SINT_SAT) {
3958 SDValue MinConst = DAG.getConstant(
3959 APInt::getSignedMaxValue(SatWidth).sext(ResultWidth), DL, ResultVT);
3960 SDValue MaxConst = DAG.getConstant(
3961 APInt::getSignedMinValue(SatWidth).sext(ResultWidth), DL, ResultVT);
3962 SDValue MinVal = DAG.getNode(ISD::SMIN, DL, ResultVT, FpToInt, MinConst);
3963 IntSatVal = DAG.getNode(ISD::SMAX, DL, ResultVT, MinVal, MaxConst);
3964 } else {
3965 SDValue MinConst = DAG.getConstant(
3966 APInt::getMaxValue(SatWidth).zext(ResultWidth), DL, ResultVT);
3967 IntSatVal = DAG.getNode(ISD::UMIN, DL, ResultVT, FpToInt, MinConst);
3968 }
3969
3970 // Finally, after saturating at i16 or i32 fit into the destination type
3971 return DAG.getExtOrTrunc(OpOpcode == ISD::FP_TO_SINT_SAT, IntSatVal, DL,
3972 DstVT);
3973 }
3974
3975 // SatWidth == DstWidth or SatWidth > 32
3976
3977 // Saturate at i32 for i64 dst and f16/bf16 src (will invoke f16 promotion
3978 // below)
3979 if (DstVT == MVT::i64 &&
3980 (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
3981 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
3982 const SDValue Int32VTOp = DAG.getValueType(MVT::i32);
3983 return DAG.getNode(OpOpcode, DL, DstVT, Src, Int32VTOp);
3984 }
3985
3986 // Promote f16/bf16 src to f32 for i32 conversion
3987 if (DstVT == MVT::i32 && (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
3988 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3989 return DAG.getNode(Op.getOpcode(), DL, DstVT, PromotedSrc, SatVTOp);
3990 }
3991
3992 // For DstWidth < 16, promote i1 and i8 dst to i16 (if legal) with sub-i16
3993 // saturation. For DstWidth == 16, promote i16 dst to i32 with sub-i32
3994 // saturation; this covers i16.f32 and i16.f64
3995 if (DstWidth < 32) {
3996 // Note: this triggers SatWidth < DstWidth above to generate saturated
3997 // truncate by requesting MVT::i16/i32 destination with SatWidth < 16/32.
3998 MVT PromoteVT =
3999 (DstWidth < 16 && Subtarget->has16BitInsts()) ? MVT::i16 : MVT::i32;
4000 SDValue FpToInt = DAG.getNode(OpOpcode, DL, PromoteVT, Src, SatVTOp);
4001 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, FpToInt);
4002 }
4003
4004 // TODO: can we implement i64 dst for f32/f64?
4005
4006 return SDValue();
4007}
4008
4010 SelectionDAG &DAG) const {
4011 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4012 MVT VT = Op.getSimpleValueType();
4013 MVT ScalarVT = VT.getScalarType();
4014
4015 assert(VT.isVector());
4016
4017 SDValue Src = Op.getOperand(0);
4018 SDLoc DL(Op);
4019
4020 // TODO: Don't scalarize on Evergreen?
4021 unsigned NElts = VT.getVectorNumElements();
4023 DAG.ExtractVectorElements(Src, Args, 0, NElts);
4024
4025 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
4026 for (unsigned I = 0; I < NElts; ++I)
4027 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
4028
4029 return DAG.getBuildVector(VT, DL, Args);
4030}
4031
4032//===----------------------------------------------------------------------===//
4033// Custom DAG optimizations
4034//===----------------------------------------------------------------------===//
4035
4036static bool isU24(SDValue Op, SelectionDAG &DAG) {
4037 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
4038}
4039
4040static bool isI24(SDValue Op, SelectionDAG &DAG) {
4041 EVT VT = Op.getValueType();
4042 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
4043 // as unsigned 24-bit values.
4045}
4046
4049 SelectionDAG &DAG = DCI.DAG;
4050 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4051 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
4052
4053 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
4054 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
4055 unsigned NewOpcode = Node24->getOpcode();
4056 if (IsIntrin) {
4057 unsigned IID = Node24->getConstantOperandVal(0);
4058 switch (IID) {
4059 case Intrinsic::amdgcn_mul_i24:
4060 NewOpcode = AMDGPUISD::MUL_I24;
4061 break;
4062 case Intrinsic::amdgcn_mul_u24:
4063 NewOpcode = AMDGPUISD::MUL_U24;
4064 break;
4065 case Intrinsic::amdgcn_mulhi_i24:
4066 NewOpcode = AMDGPUISD::MULHI_I24;
4067 break;
4068 case Intrinsic::amdgcn_mulhi_u24:
4069 NewOpcode = AMDGPUISD::MULHI_U24;
4070 break;
4071 default:
4072 llvm_unreachable("Expected 24-bit mul intrinsic");
4073 }
4074 }
4075
4076 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
4077
4078 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
4079 // the operands to have other uses, but will only perform simplifications that
4080 // involve bypassing some nodes for this user.
4081 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
4082 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
4083 if (DemandedLHS || DemandedRHS)
4084 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
4085 DemandedLHS ? DemandedLHS : LHS,
4086 DemandedRHS ? DemandedRHS : RHS);
4087
4088 // Now try SimplifyDemandedBits which can simplify the nodes used by our
4089 // operands if this node is the only user.
4090 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
4091 return SDValue(Node24, 0);
4092 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
4093 return SDValue(Node24, 0);
4094
4095 return SDValue();
4096}
4097
4098template <typename IntTy>
4100 uint32_t Width, const SDLoc &DL) {
4101 if (Width + Offset < 32) {
4102 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
4103 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
4104 if constexpr (std::is_signed_v<IntTy>) {
4105 return DAG.getSignedConstant(Result, DL, MVT::i32);
4106 } else {
4107 return DAG.getConstant(Result, DL, MVT::i32);
4108 }
4109 }
4110
4111 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
4112}
4113
4114static bool hasVolatileUser(SDNode *Val) {
4115 for (SDNode *U : Val->users()) {
4116 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
4117 if (M->isVolatile())
4118 return true;
4119 }
4120 }
4121
4122 return false;
4123}
4124
4126 // i32 vectors are the canonical memory type.
4127 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
4128 return false;
4129
4130 if (!VT.isByteSized())
4131 return false;
4132
4133 unsigned Size = VT.getStoreSize();
4134
4135 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
4136 return false;
4137
4138 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
4139 return false;
4140
4141 return true;
4142}
4143
4144// Replace load of an illegal type with a bitcast from a load of a friendlier
4145// type.
4147 DAGCombinerInfo &DCI) const {
4148 if (!DCI.isBeforeLegalize())
4149 return SDValue();
4150
4152 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
4153 return SDValue();
4154
4155 SDLoc SL(N);
4156 SelectionDAG &DAG = DCI.DAG;
4157 EVT VT = LN->getMemoryVT();
4158
4159 unsigned Size = VT.getStoreSize();
4160 Align Alignment = LN->getAlign();
4161 if (Alignment < Size && isTypeLegal(VT)) {
4162 unsigned IsFast;
4163 unsigned AS = LN->getAddressSpace();
4164
4165 // Expand unaligned loads earlier than legalization. Due to visitation order
4166 // problems during legalization, the emitted instructions to pack and unpack
4167 // the bytes again are not eliminated in the case of an unaligned copy.
4169 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
4170 if (VT.isVector())
4171 return SplitVectorLoad(SDValue(LN, 0), DAG);
4172
4173 SDValue Ops[2];
4174 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
4175
4176 return DAG.getMergeValues(Ops, SDLoc(N));
4177 }
4178
4179 if (!IsFast)
4180 return SDValue();
4181 }
4182
4183 if (!shouldCombineMemoryType(VT))
4184 return SDValue();
4185
4186 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4187
4188 SDValue NewLoad
4189 = DAG.getLoad(NewVT, SL, LN->getChain(),
4190 LN->getBasePtr(), LN->getMemOperand());
4191
4192 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
4193 DCI.CombineTo(N, BC, NewLoad.getValue(1));
4194 return SDValue(N, 0);
4195}
4196
4197// Replace store of an illegal type with a store of a bitcast to a friendlier
4198// type.
4200 DAGCombinerInfo &DCI) const {
4201 if (!DCI.isBeforeLegalize())
4202 return SDValue();
4203
4205 if (!SN->isSimple() || !ISD::isNormalStore(SN))
4206 return SDValue();
4207
4208 EVT VT = SN->getMemoryVT();
4209 unsigned Size = VT.getStoreSize();
4210
4211 SDLoc SL(N);
4212 SelectionDAG &DAG = DCI.DAG;
4213 Align Alignment = SN->getAlign();
4214 if (Alignment < Size && isTypeLegal(VT)) {
4215 unsigned IsFast;
4216 unsigned AS = SN->getAddressSpace();
4217
4218 // Expand unaligned stores earlier than legalization. Due to visitation
4219 // order problems during legalization, the emitted instructions to pack and
4220 // unpack the bytes again are not eliminated in the case of an unaligned
4221 // copy.
4223 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
4224 if (VT.isVector())
4225 return SplitVectorStore(SDValue(SN, 0), DAG);
4226
4227 return expandUnalignedStore(SN, DAG);
4228 }
4229
4230 if (!IsFast)
4231 return SDValue();
4232 }
4233
4234 if (!shouldCombineMemoryType(VT))
4235 return SDValue();
4236
4237 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4238 SDValue Val = SN->getValue();
4239
4240 // DCI.AddToWorklist(Val.getNode());
4241
4242 bool OtherUses = !Val.hasOneUse();
4243 SDValue CastVal = DAG.getBitcast(NewVT, Val);
4244 if (OtherUses) {
4245 SDValue CastBack = DAG.getBitcast(VT, CastVal);
4246 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
4247 }
4248
4249 return DAG.getStore(SN->getChain(), SL, CastVal,
4250 SN->getBasePtr(), SN->getMemOperand());
4251}
4252
4253// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4254// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4255// issues.
4257 DAGCombinerInfo &DCI) const {
4258 SelectionDAG &DAG = DCI.DAG;
4259 SDValue N0 = N->getOperand(0);
4260
4261 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4262 // (vt2 (truncate (assertzext vt0:x, vt1)))
4263 if (N0.getOpcode() == ISD::TRUNCATE) {
4264 SDValue N1 = N->getOperand(1);
4265 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4266 SDLoc SL(N);
4267
4268 SDValue Src = N0.getOperand(0);
4269 EVT SrcVT = Src.getValueType();
4270 if (SrcVT.bitsGE(ExtVT)) {
4271 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4272 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4273 }
4274 }
4275
4276 return SDValue();
4277}
4278
4280 SDNode *N, DAGCombinerInfo &DCI) const {
4281 unsigned IID = N->getConstantOperandVal(0);
4282 switch (IID) {
4283 case Intrinsic::amdgcn_mul_i24:
4284 case Intrinsic::amdgcn_mul_u24:
4285 case Intrinsic::amdgcn_mulhi_i24:
4286 case Intrinsic::amdgcn_mulhi_u24:
4287 return simplifyMul24(N, DCI);
4288 case Intrinsic::amdgcn_fract:
4289 case Intrinsic::amdgcn_rsq:
4290 case Intrinsic::amdgcn_rcp_legacy:
4291 case Intrinsic::amdgcn_rsq_legacy:
4292 case Intrinsic::amdgcn_rsq_clamp:
4293 case Intrinsic::amdgcn_tanh:
4294 case Intrinsic::amdgcn_prng_b32: {
4295 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4296 SDValue Src = N->getOperand(1);
4297 return Src.isUndef() ? Src : SDValue();
4298 }
4299 case Intrinsic::amdgcn_frexp_exp: {
4300 // frexp_exp (fneg x) -> frexp_exp x
4301 // frexp_exp (fabs x) -> frexp_exp x
4302 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4303 SDValue Src = N->getOperand(1);
4304 SDValue PeekSign = peekFPSignOps(Src);
4305 if (PeekSign == Src)
4306 return SDValue();
4307 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4308 0);
4309 }
4310 default:
4311 return SDValue();
4312 }
4313}
4314
4315/// Split the 64-bit value \p LHS into two 32-bit halves and apply the binary
4316/// operation \p Opc to each half with its constant (\p ValLo or \p ValHi).
4318 DAGCombinerInfo &DCI, const SDLoc &SL,
4319 unsigned Opc, SDValue LHS,
4320 uint32_t ValLo, uint32_t ValHi) const {
4321 SelectionDAG &DAG = DCI.DAG;
4322 SDValue Lo, Hi;
4323 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4324
4325 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4326 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4327
4328 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4329 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4330
4331 // Re-visit the ands. It's possible we eliminated one of them and it could
4332 // simplify the vector.
4333 DCI.AddToWorklist(Lo.getNode());
4334 DCI.AddToWorklist(Hi.getNode());
4335
4336 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4337 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4338}
4339
4341 DAGCombinerInfo &DCI) const {
4342 EVT VT = N->getValueType(0);
4343 SDValue LHS = N->getOperand(0);
4344 SDValue RHS = N->getOperand(1);
4346 SDLoc SL(N);
4347 SelectionDAG &DAG = DCI.DAG;
4348
4349 unsigned RHSVal;
4350 if (CRHS) {
4351 RHSVal = CRHS->getZExtValue();
4352 if (!RHSVal)
4353 return LHS;
4354
4355 switch (LHS->getOpcode()) {
4356 default:
4357 break;
4358 case ISD::ZERO_EXTEND:
4359 case ISD::SIGN_EXTEND:
4360 case ISD::ANY_EXTEND: {
4361 SDValue X = LHS->getOperand(0);
4362
4363 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4364 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4365 // Prefer build_vector as the canonical form if packed types are legal.
4366 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4367 SDValue Vec = DAG.getBuildVector(
4368 MVT::v2i16, SL,
4369 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4370 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4371 }
4372
4373 // shl (ext x) => zext (shl x), if shift does not overflow int
4374 if (VT != MVT::i64)
4375 break;
4376 KnownBits Known = DAG.computeKnownBits(X);
4377 unsigned LZ = Known.countMinLeadingZeros();
4378 if (LZ < RHSVal)
4379 break;
4380 EVT XVT = X.getValueType();
4381 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4382 return DAG.getZExtOrTrunc(Shl, SL, VT);
4383 }
4384 }
4385 }
4386
4387 if (VT.getScalarType() != MVT::i64)
4388 return SDValue();
4389
4390 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4391 // common case, splitting this into a move and a 32-bit shift is faster and
4392 // the same code size.
4393 KnownBits Known = DAG.computeKnownBits(RHS);
4394
4395 EVT ElementType = VT.getScalarType();
4396 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4397 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4398
4399 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4400 return SDValue();
4401 SDValue ShiftAmt;
4402
4403 if (CRHS) {
4404 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4405 TargetType);
4406 } else {
4407 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4408 const SDValue ShiftMask =
4409 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4410 // This AND instruction will clamp out of bounds shift values.
4411 // It will also be removed during later instruction selection.
4412 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4413 }
4414
4415 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4416 SDValue NewShift =
4417 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4418
4419 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4420 SDValue Vec;
4421
4422 if (VT.isVector()) {
4423 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4424 unsigned NElts = TargetType.getVectorNumElements();
4426 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4427
4428 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4429 for (unsigned I = 0; I != NElts; ++I)
4430 HiAndLoOps[2 * I + 1] = HiOps[I];
4431 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4432 } else {
4433 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4434 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4435 }
4436 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4437}
4438
4440 DAGCombinerInfo &DCI) const {
4441 SDValue RHS = N->getOperand(1);
4443 EVT VT = N->getValueType(0);
4444 SDValue LHS = N->getOperand(0);
4445 SelectionDAG &DAG = DCI.DAG;
4446 SDLoc SL(N);
4447
4448 if (VT.getScalarType() != MVT::i64)
4449 return SDValue();
4450
4451 // For C >= 32
4452 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))
4453
4454 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4455 // common case, splitting this into a move and a 32-bit shift is faster and
4456 // the same code size.
4457 KnownBits Known = DAG.computeKnownBits(RHS);
4458
4459 EVT ElementType = VT.getScalarType();
4460 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4461 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4462
4463 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4464 return SDValue();
4465
4466 SDValue ShiftFullAmt =
4467 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4468 SDValue ShiftAmt;
4469 if (CRHS) {
4470 unsigned RHSVal = CRHS->getZExtValue();
4471 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4472 TargetType);
4473 } else if (Known.getMinValue().getZExtValue() ==
4474 (ElementType.getSizeInBits() - 1)) {
4475 ShiftAmt = ShiftFullAmt;
4476 } else {
4477 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4478 const SDValue ShiftMask =
4479 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4480 // This AND instruction will clamp out of bounds shift values.
4481 // It will also be removed during later instruction selection.
4482 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4483 }
4484
4485 EVT ConcatType;
4486 SDValue Hi;
4487 SDLoc LHSSL(LHS);
4488 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4489 if (VT.isVector()) {
4490 unsigned NElts = TargetType.getVectorNumElements();
4491 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4492 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4493 SmallVector<SDValue, 8> HiOps(NElts);
4494 SmallVector<SDValue, 16> HiAndLoOps;
4495
4496 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4497 for (unsigned I = 0; I != NElts; ++I) {
4498 HiOps[I] = HiAndLoOps[2 * I + 1];
4499 }
4500 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4501 } else {
4502 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4503 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4504 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4505 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4506 }
4507
4508 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4509 SDValue NewShift, HiShift;
4510 if (KnownLHS.isNegative()) {
4511 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4512 NewShift =
4513 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4514 } else if (CRHS &&
4515 CRHS->getZExtValue() == (ElementType.getSizeInBits() - 1)) {
4516 NewShift = HiShift =
4517 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4518 } else {
4519 Hi = DAG.getFreeze(Hi);
4520 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4521 NewShift =
4522 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4523 }
4524
4525 SDValue Vec;
4526 if (VT.isVector()) {
4527 unsigned NElts = TargetType.getVectorNumElements();
4530 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4531
4532 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4533 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4534 for (unsigned I = 0; I != NElts; ++I) {
4535 HiAndLoOps[2 * I + 1] = HiOps[I];
4536 HiAndLoOps[2 * I] = LoOps[I];
4537 }
4538 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4539 } else {
4540 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4541 }
4542 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4543}
4544
4546 DAGCombinerInfo &DCI) const {
4547 SDValue RHS = N->getOperand(1);
4549 EVT VT = N->getValueType(0);
4550 SDValue LHS = N->getOperand(0);
4551 SelectionDAG &DAG = DCI.DAG;
4552 SDLoc SL(N);
4553 unsigned RHSVal;
4554
4555 if (CRHS) {
4556 RHSVal = CRHS->getZExtValue();
4557
4558 // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
4559 // this improves the ability to match BFE patterns in isel.
4560 if (LHS.getOpcode() == ISD::AND) {
4561 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4562 unsigned MaskIdx, MaskLen;
4563 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4564 MaskIdx == RHSVal) {
4565 return DAG.getNode(ISD::AND, SL, VT,
4566 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4567 N->getOperand(1)),
4568 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4569 N->getOperand(1)));
4570 }
4571 }
4572 }
4573 }
4574
4575 if (VT.getScalarType() != MVT::i64)
4576 return SDValue();
4577
4578 // for C >= 32
4579 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4580
4581 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4582 // common case, splitting this into a move and a 32-bit shift is faster and
4583 // the same code size.
4584 KnownBits Known = DAG.computeKnownBits(RHS);
4585
4586 EVT ElementType = VT.getScalarType();
4587 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4588 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4589
4590 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4591 return SDValue();
4592
4593 SDValue ShiftAmt;
4594 if (CRHS) {
4595 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4596 TargetType);
4597 } else {
4598 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4599 const SDValue ShiftMask =
4600 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4601 // This AND instruction will clamp out of bounds shift values.
4602 // It will also be removed during later instruction selection.
4603 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4604 }
4605
4606 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4607 EVT ConcatType;
4608 SDValue Hi;
4609 SDLoc LHSSL(LHS);
4610 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4611 if (VT.isVector()) {
4612 unsigned NElts = TargetType.getVectorNumElements();
4613 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4614 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4615 SmallVector<SDValue, 8> HiOps(NElts);
4616 SmallVector<SDValue, 16> HiAndLoOps;
4617
4618 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4619 for (unsigned I = 0; I != NElts; ++I)
4620 HiOps[I] = HiAndLoOps[2 * I + 1];
4621 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4622 } else {
4623 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4624 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4625 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4626 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4627 }
4628
4629 SDValue NewShift =
4630 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4631
4632 SDValue Vec;
4633 if (VT.isVector()) {
4634 unsigned NElts = TargetType.getVectorNumElements();
4636 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4637
4638 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4639 for (unsigned I = 0; I != NElts; ++I)
4640 HiAndLoOps[2 * I] = LoOps[I];
4641 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4642 } else {
4643 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4644 }
4645 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4646}
4647
4649 SDNode *N, DAGCombinerInfo &DCI) const {
4650 SDLoc SL(N);
4651 SelectionDAG &DAG = DCI.DAG;
4652 EVT VT = N->getValueType(0);
4653 SDValue Src = N->getOperand(0);
4654
4655 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4656 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4657 SDValue Vec = Src.getOperand(0);
4658 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4659 SDValue Elt0 = Vec.getOperand(0);
4660 EVT EltVT = Elt0.getValueType();
4661 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4662 if (EltVT.isFloatingPoint()) {
4663 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4664 EltVT.changeTypeToInteger(), Elt0);
4665 }
4666
4667 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4668 }
4669 }
4670 }
4671
4672 // Equivalent of above for accessing the high element of a vector as an
4673 // integer operation.
4674 // trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
4675 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4676 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4677 SDValue BV = stripBitcast(Src.getOperand(0));
4678 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4679 EVT SrcEltVT = BV.getOperand(0).getValueType();
4680 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4681 unsigned BitIndex = K->getZExtValue();
4682 unsigned PartIndex = BitIndex / SrcEltSize;
4683
4684 if (PartIndex * SrcEltSize == BitIndex &&
4685 PartIndex < BV.getNumOperands()) {
4686 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4687 SDValue SrcElt =
4688 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4689 BV.getOperand(PartIndex));
4690 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4691 }
4692 }
4693 }
4694 }
4695 }
4696
4697 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4698 //
4699 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4700 // i16 (trunc (srl (i32 (trunc x)), K))
4701 if (VT.getScalarSizeInBits() < 32) {
4702 EVT SrcVT = Src.getValueType();
4703 if (SrcVT.getScalarSizeInBits() > 32 &&
4704 (Src.getOpcode() == ISD::SRL ||
4705 Src.getOpcode() == ISD::SRA ||
4706 Src.getOpcode() == ISD::SHL)) {
4707 SDValue Amt = Src.getOperand(1);
4708 KnownBits Known = DAG.computeKnownBits(Amt);
4709
4710 // - For left shifts, do the transform as long as the shift
4711 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4712 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4713 // losing information stored in the high bits when truncating.
4714 const unsigned MaxCstSize =
4715 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4716 if (Known.getMaxValue().ule(MaxCstSize)) {
4717 EVT MidVT = VT.isVector() ?
4718 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4719 VT.getVectorNumElements()) : MVT::i32;
4720
4721 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4722 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4723 Src.getOperand(0));
4724 DCI.AddToWorklist(Trunc.getNode());
4725
4726 if (Amt.getValueType() != NewShiftVT) {
4727 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4728 DCI.AddToWorklist(Amt.getNode());
4729 }
4730
4731 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4732 Trunc, Amt);
4733 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4734 }
4735 }
4736 }
4737
4738 return SDValue();
4739}
4740
4741// We need to specifically handle i64 mul here to avoid unnecessary conversion
4742// instructions. If we only match on the legalized i64 mul expansion,
4743// SimplifyDemandedBits will be unable to remove them because there will be
4744// multiple uses due to the separate mul + mulh[su].
4745static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4746 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4747 if (Size <= 32) {
4748 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4749 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4750 }
4751
4752 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4753 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4754
4755 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4756 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4757
4758 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4759}
4760
4761/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4762/// return SDValue().
4763static SDValue getAddOneOp(const SDNode *V) {
4764 if (V->getOpcode() != ISD::ADD)
4765 return SDValue();
4766
4767 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4768}
4769
4771 DAGCombinerInfo &DCI) const {
  // Combine for an ISD::MUL node (asserted below). Only divergent, scalar
  // multiplies of at most 64 bits are considered. Two rewrites are tried, in
  // this order:
  //   1. mul x, (add y, 1) -> add (mul x, y), x
  //   2. a MUL_U24 / MUL_I24 based multiply (see getMul24) when both operands
  //      pass isU24 / isI24 and the subtarget reports the instruction.
  // Returns an empty SDValue when neither applies.
4772 assert(N->getOpcode() == ISD::MUL);
4773 EVT VT = N->getValueType(0);
4774
4775 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4776 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4777 // unnecessarily). isDivergent() is used as an approximation of whether the
4778 // value is in an SGPR.
4779 if (!N->isDivergent())
4780 return SDValue();
4781
4782 unsigned Size = VT.getSizeInBits();
4783 if (VT.isVector() || Size > 64)
4784 return SDValue();
4785
4786 SelectionDAG &DAG = DCI.DAG;
4787 SDLoc DL(N);
4788
4789 SDValue N0 = N->getOperand(0);
4790 SDValue N1 = N->getOperand(1);
4791
4792 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4793 // matching.
4794
4795 // mul x, (add y, 1) -> add (mul x, y), x
  // Yields `y` when V is `add y, 1` and that add either has a single use or
  // is used exclusively by ISD::MUL nodes; otherwise yields an empty SDValue.
4796 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4797 SDValue AddOp = getAddOneOp(V.getNode());
4798 if (!AddOp)
4799 return SDValue();
4800
4801 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4802 return U->getOpcode() == ISD::MUL;
4803 }))
4804 return AddOp;
4805
4806 return SDValue();
4807 };
4808
4809 // FIXME: The selection pattern is not properly checking for commuted
4810 // operands, so we have to place the mul in the LHS
4811 if (SDValue MulOper = IsFoldableAdd(N0)) {
4812 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4813 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4814 }
4815
4816 if (SDValue MulOper = IsFoldableAdd(N1)) {
4817 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4818 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4819 }
4820
4821 // There are i16 integer mul/mad.
4822 if (isTypeLegal(MVT::i16) && VT.getScalarType().bitsLE(MVT::i16))
4823 return SDValue();
4824
4825 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4826 // in the source into any_extends if the result of the mul is truncated. Since
4827 // we can assume the high bits are whatever we want, use the underlying value
4828 // to avoid the unknown high bits from interfering.
4829 if (N0.getOpcode() == ISD::ANY_EXTEND)
4830 N0 = N0.getOperand(0);
4831
4832 if (N1.getOpcode() == ISD::ANY_EXTEND)
4833 N1 = N1.getOperand(0);
4834
4835 SDValue Mul;
4836
  // The unsigned form is preferred; the signed form is only tried when the
  // unsigned checks fail. Operands are brought to i32 with the matching
  // extension before the 24-bit node is built.
4837 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4838 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4839 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4840 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4841 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4842 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4843 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4844 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4845 } else {
4846 return SDValue();
4847 }
4848
4849 // We need to use sext even for MUL_U24, because MUL_U24 is used
4850 // for signed multiply of 8 and 16-bit types.
4851 return DAG.getSExtOrTrunc(Mul, DL, VT);
4852}
4853
/// Combine an i32 SMUL_LOHI / UMUL_LOHI whose operands both fit in 24 bits
/// into a MUL_[IU]24 (low result) plus MULHI_[IU]24 (high result) pair.
///
/// Any opcode other than ISD::SMUL_LOHI is treated as the unsigned form. On
/// success both results of \p N are replaced through DCI.CombineTo and
/// SDValue(N, 0) is returned; otherwise an empty SDValue is returned.
4854SDValue
4856 DAGCombinerInfo &DCI) const {
4857 if (N->getValueType(0) != MVT::i32)
4858 return SDValue();
4859
4860 SelectionDAG &DAG = DCI.DAG;
4861 SDLoc DL(N);
4862
4863 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4864 SDValue N0 = N->getOperand(0);
4865 SDValue N1 = N->getOperand(1);
4866
4867 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4868 // in the source into any_extends if the result of the mul is truncated. Since
4869 // we can assume the high bits are whatever we want, use the underlying value
4870 // to avoid the unknown high bits from interfering.
4871 if (N0.getOpcode() == ISD::ANY_EXTEND)
4872 N0 = N0.getOperand(0);
4873 if (N1.getOpcode() == ISD::ANY_EXTEND)
4874 N1 = N1.getOperand(0);
4875
4876 // Try to use two fast 24-bit multiplies (one for each half of the result)
4877 // instead of one slow extending multiply.
  // LoOpcode stays 0 when the operands or the subtarget do not qualify; that
  // zero is the "no combine" marker tested after the branches.
4878 unsigned LoOpcode = 0;
4879 unsigned HiOpcode = 0;
4880 if (Signed) {
4881 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4882 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4883 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4884 LoOpcode = AMDGPUISD::MUL_I24;
4885 HiOpcode = AMDGPUISD::MULHI_I24;
4886 }
4887 } else {
4888 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4889 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4890 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4891 LoOpcode = AMDGPUISD::MUL_U24;
4892 HiOpcode = AMDGPUISD::MULHI_U24;
4893 }
4894 }
4895 if (!LoOpcode)
4896 return SDValue();
4897
4898 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4899 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
  // Result 0 of N becomes Lo and result 1 becomes Hi.
4900 DCI.CombineTo(N, Lo, Hi);
4901 return SDValue(N, 0);
4902}
4903
4905 DAGCombinerInfo &DCI) const {
  // Rewrites a scalar node with two isI24 operands as MULHI_I24 on i32,
  // then sign-extends or truncates the result back to the node's type.
  // Requires Subtarget->hasMulI24(). Returns an empty SDValue otherwise.
  // NOTE(review): there is no upper bound on the width of VT here, whereas
  // the MULHI_U24 combine that follows rejects types wider than 32 bits;
  // confirm wider types cannot reach this point.
4906 EVT VT = N->getValueType(0);
4907
4908 if (!Subtarget->hasMulI24() || VT.isVector())
4909 return SDValue();
4910
4911 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4912 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4913 // unnecessarily). isDivergent() is used as an approximation of whether the
4914 // value is in an SGPR.
4915 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4916 // valu op anyway)
4917 if (Subtarget->hasSMulHi() && !N->isDivergent())
4918 return SDValue();
4919
4920 SelectionDAG &DAG = DCI.DAG;
4921 SDLoc DL(N);
4922
4923 SDValue N0 = N->getOperand(0);
4924 SDValue N1 = N->getOperand(1);
4925
4926 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4927 return SDValue();
4928
4929 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4930 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4931
4932 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
  // Queue the new node so the combiner revisits it.
4933 DCI.AddToWorklist(Mulhi.getNode());
4934 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4935}
4936
4938 DAGCombinerInfo &DCI) const {
  // Rewrites a scalar node of at most 32 bits with two isU24 operands as
  // MULHI_U24 on i32, then zero-extends or truncates the result back to the
  // node's type. Requires Subtarget->hasMulU24(). Returns an empty SDValue
  // otherwise.
4939 EVT VT = N->getValueType(0);
4940
4941 if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
4942 return SDValue();
4943
4944 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4945 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4946 // unnecessarily). isDivergent() is used as an approximation of whether the
4947 // value is in an SGPR.
4948 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4949 // valu op anyway)
4950 if (!N->isDivergent() && Subtarget->hasSMulHi())
4951 return SDValue();
4952
4953 SelectionDAG &DAG = DCI.DAG;
4954 SDLoc DL(N);
4955
4956 SDValue N0 = N->getOperand(0);
4957 SDValue N1 = N->getOperand(1);
4958
4959 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4960 return SDValue();
4961
4962 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4963 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4964
4965 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
  // Queue the new node so the combiner revisits it.
4966 DCI.AddToWorklist(Mulhi.getNode());
4967 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4968}
4969
4970SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4971 SDValue Op,
4972 const SDLoc &DL,
4973 unsigned Opc) const {
4974 EVT VT = Op.getValueType();
4975 if (VT.bitsGT(MVT::i32))
4976 return SDValue();
4977
4978 if (VT != MVT::i32)
4979 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4980
4981 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4982 if (VT != MVT::i32)
4983 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4984
4985 return FFBX;
4986}
4987
4988// The native instructions return -1 on 0 input. Optimize out a select that
4989// produces -1 on 0.
4990//
4991// TODO: If zero is not undef, we could also do this if the output is compared
4992// against the bitwidth.
4993//
4994// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4996 SDValue LHS, SDValue RHS,
4997 DAGCombinerInfo &DCI) const {
  // Cond is read as a compare node: operand 0 is the compared value, operand 1
  // must be the constant zero, operand 2 holds the condition code. LHS and RHS
  // are the two values being selected between. Returns an empty SDValue when
  // neither pattern below matches (getFFBX_U32 may also return one).
4998 if (!isNullConstant(Cond.getOperand(1)))
4999 return SDValue();
5000
5001 SelectionDAG &DAG = DCI.DAG;
5002 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
5003 SDValue CmpLHS = Cond.getOperand(0);
5004
5005 // select (setcc x, 0, eq), -1, (ctlz_zero_poison x) -> ffbh_u32 x
5006 // select (setcc x, 0, eq), -1, (cttz_zero_poison x) -> ffbl_u32 x
  // The count must be taken of the same value that is compared against zero.
5007 if (CCOpcode == ISD::SETEQ &&
5008 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
5009 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
5010 unsigned Opc =
5011 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
5012 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
5013 }
5014
5015 // select (setcc x, 0, ne), (ctlz_zero_poison x), -1 -> ffbh_u32 x
5016 // select (setcc x, 0, ne), (cttz_zero_poison x), -1 -> ffbl_u32 x
  // Mirror of the case above with the select arms swapped.
5017 if (CCOpcode == ISD::SETNE &&
5018 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
5019 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
5020 unsigned Opc =
5021 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
5022
5023 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
5024 }
5025
5026 return SDValue();
5027}
5028
5030 unsigned Op,
5031 const SDLoc &SL,
5032 SDValue Cond,
5033 SDValue N1,
5034 SDValue N2) {
  // Builds `Op (select Cond, N1.operand(0), N2.operand(0))`: the select is
  // formed over the first operands of N1 and N2 and the unary node Op is
  // applied to its result. The result type is N1's value type. The new select
  // is queued on the combiner worklist.
5035 SelectionDAG &DAG = DCI.DAG;
5036 EVT VT = N1.getValueType();
5037
5038 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
5039 N1.getOperand(0), N2.getOperand(0));
5040 DCI.AddToWorklist(NewSelect.getNode());
5041 return DAG.getNode(Op, SL, VT, NewSelect);
5042}
5043
5044// Pull a free FP operation out of a select so it may fold into uses.
5045//
5046// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
5047// select c, (fneg x), k -> fneg (select c, x, (fneg k))
5048//
5049// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
5050// select c, (fabs x), +k -> fabs (select c, x, k)
5051SDValue
5053 SDValue N) const {
5054 SelectionDAG &DAG = DCI.DAG;
5055 SDValue Cond = N.getOperand(0);
5056 SDValue LHS = N.getOperand(1);
5057 SDValue RHS = N.getOperand(2);
5058
5059 EVT VT = N.getValueType();
5060 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
5061 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
5063 return SDValue();
5064
5065 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
5066 SDLoc(N), Cond, LHS, RHS);
5067 }
5068
5069 bool Inv = false;
5070 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
5071 std::swap(LHS, RHS);
5072 Inv = true;
5073 }
5074
5075 // TODO: Support vector constants.
5077 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
5078 !selectSupportsSourceMods(N.getNode())) {
5079 SDLoc SL(N);
5080 // If one side is an fneg/fabs and the other is a constant, we can push the
5081 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
5082 SDValue NewLHS = LHS.getOperand(0);
5083 SDValue NewRHS = RHS;
5084
5085 // Careful: if the neg can be folded up, don't try to pull it back down.
5086 bool ShouldFoldNeg = true;
5087
5088 if (NewLHS.hasOneUse()) {
5089 unsigned Opc = NewLHS.getOpcode();
5090 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
5091 ShouldFoldNeg = false;
5092 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
5093 ShouldFoldNeg = false;
5094 }
5095
5096 if (ShouldFoldNeg) {
5097 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
5098 return SDValue();
5099
5100 // We're going to be forced to use a source modifier anyway, there's no
5101 // point to pulling the negate out unless we can get a size reduction by
5102 // negating the constant.
5103 //
5104 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
5105 // about cheaper constants.
5106 if (NewLHS.getOpcode() == ISD::FABS &&
5108 return SDValue();
5109
5111 return SDValue();
5112
5113 if (LHS.getOpcode() == ISD::FNEG)
5114 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5115
5116 if (Inv)
5117 std::swap(NewLHS, NewRHS);
5118
5119 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
5120 Cond, NewLHS, NewRHS);
5121 DCI.AddToWorklist(NewSelect.getNode());
5122 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
5123 }
5124 }
5125
5126 return SDValue();
5127}
5128
5130 DAGCombinerInfo &DCI) const {
5131 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
5132 return Folded;
5133
5134 SDValue Cond = N->getOperand(0);
5135 if (Cond.getOpcode() != ISD::SETCC)
5136 return SDValue();
5137
5138 EVT VT = N->getValueType(0);
5139 SDValue LHS = Cond.getOperand(0);
5140 SDValue RHS = Cond.getOperand(1);
5141 SDValue CC = Cond.getOperand(2);
5142
5143 SDValue True = N->getOperand(1);
5144 SDValue False = N->getOperand(2);
5145
5146 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
5147 SelectionDAG &DAG = DCI.DAG;
5148 if (DAG.isConstantValueOfAnyType(True) &&
5149 !DAG.isConstantValueOfAnyType(False)) {
5150 // Swap cmp + select pair to move constant to false input.
5151 // This will allow using VOPC cndmasks more often.
5152 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
5153
5154 SDLoc SL(N);
5155 ISD::CondCode NewCC =
5156 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
5157
5158 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
5159 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
5160 }
5161
5162 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
5164 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
5165 // Revisit this node so we can catch min3/max3/med3 patterns.
5166 //DCI.AddToWorklist(MinMax.getNode());
5167 return MinMax;
5168 }
5169 }
5170
5171 // There's no reason to not do this if the condition has other uses.
5172 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
5173}
5174
5175static bool isInv2Pi(const APFloat &APF) {
5176 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5177 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5178 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5179
5180 return APF.bitwiseIsEqual(KF16) ||
5181 APF.bitwiseIsEqual(KF32) ||
5182 APF.bitwiseIsEqual(KF64);
5183}
5184
5185// The negated forms of 0 and 1.0 / (2.0 * pi) do not have inline immediates,
5186// so there is an additional cost to negate them.
5189 if (C->isZero())
5190 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5191
5192 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
5193 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5194
5196}
5197
5203
5209
5210static unsigned inverseMinMax(unsigned Opc) {
5211 switch (Opc) {
5212 case ISD::FMAXNUM:
5213 return ISD::FMINNUM;
5214 case ISD::FMINNUM:
5215 return ISD::FMAXNUM;
5216 case ISD::FMAXNUM_IEEE:
5217 return ISD::FMINNUM_IEEE;
5218 case ISD::FMINNUM_IEEE:
5219 return ISD::FMAXNUM_IEEE;
5220 case ISD::FMAXIMUM:
5221 return ISD::FMINIMUM;
5222 case ISD::FMINIMUM:
5223 return ISD::FMAXIMUM;
5224 case ISD::FMAXIMUMNUM:
5225 return ISD::FMINIMUMNUM;
5226 case ISD::FMINIMUMNUM:
5227 return ISD::FMAXIMUMNUM;
5228 case AMDGPUISD::FMAX_LEGACY:
5229 return AMDGPUISD::FMIN_LEGACY;
5230 case AMDGPUISD::FMIN_LEGACY:
5231 return AMDGPUISD::FMAX_LEGACY;
5232 default:
5233 llvm_unreachable("invalid min/max opcode");
5234 }
5235}
5236
5237/// \return true if it's profitable to try to push an fneg into its source
5238/// instruction.
5240 // If the input has multiple uses and we can either fold the negate down, or
5241 // the other uses cannot, give up. This both prevents unprofitable
5242 // transformations and infinite loops: we won't repeatedly try to fold around
5243 // a negate that has no 'good' form.
5244 if (N0.hasOneUse()) {
5245 // This may be able to fold into the source, but at a code size cost. Don't
5246 // fold if the fold into the user is free.
5247 if (allUsesHaveSourceMods(N, 0))
5248 return false;
5249 } else {
5250 if (fnegFoldsIntoOp(N0.getNode()) &&
5252 return false;
5253 }
5254
5255 return true;
5256}
5257
5259 DAGCombinerInfo &DCI) const {
5260 SelectionDAG &DAG = DCI.DAG;
5261 SDValue N0 = N->getOperand(0);
5262 EVT VT = N->getValueType(0);
5263
5264 unsigned Opc = N0.getOpcode();
5265
5266 if (!shouldFoldFNegIntoSrc(N, N0))
5267 return SDValue();
5268
5269 SDLoc SL(N);
5270 switch (Opc) {
5271 case ISD::FADD: {
5272 if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
5273 return SDValue();
5274
5275 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5276 SDValue LHS = N0.getOperand(0);
5277 SDValue RHS = N0.getOperand(1);
5278
5279 if (LHS.getOpcode() != ISD::FNEG)
5280 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5281 else
5282 LHS = LHS.getOperand(0);
5283
5284 if (RHS.getOpcode() != ISD::FNEG)
5285 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5286 else
5287 RHS = RHS.getOperand(0);
5288
5289 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5290 if (Res.getOpcode() != ISD::FADD)
5291 return SDValue(); // Op got folded away.
5292 if (!N0.hasOneUse())
5293 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5294 return Res;
5295 }
5296 case ISD::FMUL:
5297 case AMDGPUISD::FMUL_LEGACY: {
5298 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5299 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5300 SDValue LHS = N0.getOperand(0);
5301 SDValue RHS = N0.getOperand(1);
5302
5303 if (LHS.getOpcode() == ISD::FNEG)
5304 LHS = LHS.getOperand(0);
5305 else if (RHS.getOpcode() == ISD::FNEG)
5306 RHS = RHS.getOperand(0);
5307 else
5308 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5309
5310 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5311 if (Res.getOpcode() != Opc)
5312 return SDValue(); // Op got folded away.
5313 if (!N0.hasOneUse())
5314 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5315 return Res;
5316 }
5317 case ISD::FMA:
5318 case ISD::FMAD: {
5319 // TODO: handle llvm.amdgcn.fma.legacy
5320 if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
5321 return SDValue();
5322
5323 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5324 SDValue LHS = N0.getOperand(0);
5325 SDValue MHS = N0.getOperand(1);
5326 SDValue RHS = N0.getOperand(2);
5327
5328 if (LHS.getOpcode() == ISD::FNEG)
5329 LHS = LHS.getOperand(0);
5330 else if (MHS.getOpcode() == ISD::FNEG)
5331 MHS = MHS.getOperand(0);
5332 else
5333 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5334
5335 if (RHS.getOpcode() != ISD::FNEG)
5336 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5337 else
5338 RHS = RHS.getOperand(0);
5339
5340 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5341 if (Res.getOpcode() != Opc)
5342 return SDValue(); // Op got folded away.
5343 if (!N0.hasOneUse())
5344 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5345 return Res;
5346 }
5347 case ISD::FMAXNUM:
5348 case ISD::FMINNUM:
5349 case ISD::FMAXNUM_IEEE:
5350 case ISD::FMINNUM_IEEE:
5351 case ISD::FMINIMUM:
5352 case ISD::FMAXIMUM:
5353 case ISD::FMINIMUMNUM:
5354 case ISD::FMAXIMUMNUM:
5355 case AMDGPUISD::FMAX_LEGACY:
5356 case AMDGPUISD::FMIN_LEGACY: {
5357 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5358 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5359 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5360 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5361
5362 SDValue LHS = N0.getOperand(0);
5363 SDValue RHS = N0.getOperand(1);
5364
5365 // 0 doesn't have a negated inline immediate.
5366 // TODO: This constant check should be generalized to other operations.
5368 return SDValue();
5369
5370 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5371 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5372 unsigned Opposite = inverseMinMax(Opc);
5373
5374 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5375 if (Res.getOpcode() != Opposite)
5376 return SDValue(); // Op got folded away.
5377 if (!N0.hasOneUse())
5378 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5379 return Res;
5380 }
5381 case AMDGPUISD::FMED3: {
5382 // med3 sorts a NaN input as smaller than everything regardless of its sign,
5383 // so negating all operands does not sign-flip the median when an input may
5384 // be NaN.
5385 if (!N0->getFlags().hasNoNaNs())
5386 return SDValue();
5387
5388 SDValue Ops[3];
5389 for (unsigned I = 0; I < 3; ++I)
5390 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5391
5392 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5393 if (Res.getOpcode() != AMDGPUISD::FMED3)
5394 return SDValue(); // Op got folded away.
5395
5396 if (!N0.hasOneUse()) {
5397 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5398 DAG.ReplaceAllUsesWith(N0, Neg);
5399
5400 for (SDNode *U : Neg->users())
5401 DCI.AddToWorklist(U);
5402 }
5403
5404 return Res;
5405 }
5406 case ISD::FP_EXTEND:
5407 case ISD::FTRUNC:
5408 case ISD::FRINT:
5409 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5410 case ISD::FROUNDEVEN:
5411 case ISD::FSIN:
5412 case ISD::FCANONICALIZE:
5413 case AMDGPUISD::RCP:
5414 case AMDGPUISD::RCP_LEGACY:
5415 case AMDGPUISD::RCP_IFLAG:
5416 case AMDGPUISD::SIN_HW: {
5417 SDValue CvtSrc = N0.getOperand(0);
5418 if (CvtSrc.getOpcode() == ISD::FNEG) {
5419 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5420 // (fneg (rcp (fneg x))) -> (rcp x)
5421 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5422 }
5423
5424 if (!N0.hasOneUse())
5425 return SDValue();
5426
5427 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5428 // (fneg (rcp x)) -> (rcp (fneg x))
5429 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5430 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5431 }
5432 case ISD::FP_ROUND: {
5433 SDValue CvtSrc = N0.getOperand(0);
5434
5435 if (CvtSrc.getOpcode() == ISD::FNEG) {
5436 // (fneg (fp_round (fneg x))) -> (fp_round x)
5437 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5438 CvtSrc.getOperand(0), N0.getOperand(1));
5439 }
5440
5441 if (!N0.hasOneUse())
5442 return SDValue();
5443
5444 // (fneg (fp_round x)) -> (fp_round (fneg x))
5445 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5446 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5447 }
5448 case ISD::FP16_TO_FP: {
5449 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5450 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5451 // Put the fneg back as a legal source operation that can be matched later.
5452 SDLoc SL(N);
5453
5454 SDValue Src = N0.getOperand(0);
5455 EVT SrcVT = Src.getValueType();
5456
5457 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5458 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5459 DAG.getConstant(0x8000, SL, SrcVT));
5460 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5461 }
5462 case ISD::SELECT: {
5463 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5464 // TODO: Invert conditions of foldFreeOpFromSelect
5465 return SDValue();
5466 }
5467 case ISD::BITCAST: {
5468 SDLoc SL(N);
5469 SDValue BCSrc = N0.getOperand(0);
5470 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5471 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5472 if (VT != MVT::f64 || HighBits.getValueType().getSizeInBits() != 32 ||
5473 !fnegFoldsIntoOp(HighBits.getNode()))
5474 return SDValue();
5475
5476 // f64 fneg only really needs to operate on the high half of the
5477 // register, so try to force it to an f32 operation to help make use of
5478 // source modifiers.
5479 //
5480 //
5481 // fneg (f64 (bitcast (build_vector x, y))) ->
5482 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5483 // (fneg (bitcast i32:y to f32)))
5484
5485 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5486 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5487 SDValue CastBack =
5488 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5489
5491 Ops.back() = CastBack;
5492 DCI.AddToWorklist(NegHi.getNode());
5493 SDValue Build =
5494 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5495 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5496
5497 if (!N0.hasOneUse())
5498 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5499 return Result;
5500 }
5501
5502 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5503 BCSrc.hasOneUse()) {
5504 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5505 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5506
5507 // TODO: Cast back result for multiple uses is beneficial in some cases.
5508
5509 SDValue LHS =
5510 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5511 SDValue RHS =
5512 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5513
5514 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5515 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5516
5517 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5518 NegRHS);
5519 }
5520
5521 return SDValue();
5522 }
5523 default:
5524 return SDValue();
5525 }
5526}
5527
5529 DAGCombinerInfo &DCI) const {
  // Folds the node into its operand when that operand has a single use and is
  // an FP16_TO_FP: the sign bit is cleared on the integer source instead.
  // Every other operand kind yields an empty SDValue.
5530 SelectionDAG &DAG = DCI.DAG;
5531 SDValue N0 = N->getOperand(0);
5532
5533 if (!N0.hasOneUse())
5534 return SDValue();
5535
5536 switch (N0.getOpcode()) {
5537 case ISD::FP16_TO_FP: {
5538 assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
5539 SDLoc SL(N);
5540 SDValue Src = N0.getOperand(0);
5541 EVT SrcVT = Src.getValueType();
5542
5543 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
  // 0x7fff keeps the low 15 bits and clears bit 15 of the integer source.
5544 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5545 DAG.getConstant(0x7fff, SL, SrcVT));
5546 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5547 }
5548 default:
5549 return SDValue();
5550 }
5551}
5552
5554 DAGCombinerInfo &DCI) const {
  // Constant-folds the node when operand 0 is a floating-point constant: the
  // result is 1.0 / value, computed in the constant's own float semantics.
  // Non-constant operands yield an empty SDValue.
5555 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5556 if (!CFP)
5557 return SDValue();
5558
5559 // XXX - Should this flush denormals?
5560 const APFloat &Val = CFP->getValueAPF();
5561 APFloat One(Val.getSemantics(), "1.0");
5562 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5563}
5564
5566 if (!Subtarget->isGCN())
5567 return false;
5568
5571 auto &ST = DAG.getSubtarget<GCNSubtarget>();
5572 const auto *TII = ST.getInstrInfo();
5573
5574 if (!ST.hasVMovB64Inst() || (!SDConstant && !SDFPConstant))
5575 return false;
5576
5577 if (ST.has64BitLiterals())
5578 return true;
5579
5580 if (SDConstant) {
5581 const APInt &APVal = SDConstant->getAPIntValue();
5582 return isUInt<32>(APVal.getZExtValue()) || TII->isInlineConstant(APVal);
5583 }
5584
5585 APInt Val = SDFPConstant->getValueAPF().bitcastToAPInt();
5586 return isUInt<32>(Val.getZExtValue()) || TII->isInlineConstant(Val);
5587}
5588
5590 DAGCombinerInfo &DCI) const {
5591 SelectionDAG &DAG = DCI.DAG;
5592 SDLoc DL(N);
5593
5594 switch(N->getOpcode()) {
5595 default:
5596 break;
5597 case ISD::BITCAST: {
5598 EVT DestVT = N->getValueType(0);
5599
5600 // Push casts through vector builds. This helps avoid emitting a large
5601 // number of copies when materializing floating point vector constants.
5602 //
5603 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5604 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5605 if (DestVT.isVector()) {
5606 SDValue Src = N->getOperand(0);
5607 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5610 EVT SrcVT = Src.getValueType();
5611 unsigned NElts = DestVT.getVectorNumElements();
5612
5613 if (SrcVT.getVectorNumElements() == NElts) {
5614 EVT DestEltVT = DestVT.getVectorElementType();
5615
5616 SmallVector<SDValue, 8> CastedElts;
5617 SDLoc SL(N);
5618 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5619 SDValue Elt = Src.getOperand(I);
5620 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5621 }
5622
5623 return DAG.getBuildVector(DestVT, SL, CastedElts);
5624 }
5625 }
5626 }
5627
5628 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5629 break;
5630
5631 // Fold bitcasts of constants.
5632 //
5633 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5634 // TODO: Generalize and move to DAGCombiner
5635 SDValue Src = N->getOperand(0);
5637 SDLoc SL(N);
5638 if (isInt64ImmLegal(C, DAG))
5639 break;
5640 uint64_t CVal = C->getZExtValue();
5641 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5642 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5643 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5644 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5645 }
5646
5648 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5649 SDLoc SL(N);
5650 if (isInt64ImmLegal(C, DAG))
5651 break;
5652 uint64_t CVal = Val.getZExtValue();
5653 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5654 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5655 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5656
5657 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5658 }
5659
5660 break;
5661 }
5662 case ISD::SHL:
5663 case ISD::SRA:
5664 case ISD::SRL: {
5665 // Range metadata can be invalidated when loads are converted to legal types
5666 // (e.g. v2i64 -> v4i32).
5667 // Try to convert vector shl/sra/srl before type legalization so that range
5668 // metadata can be utilized.
5669 if (!(N->getValueType(0).isVector() &&
5672 break;
5673 if (N->getOpcode() == ISD::SHL)
5674 return performShlCombine(N, DCI);
5675 if (N->getOpcode() == ISD::SRA)
5676 return performSraCombine(N, DCI);
5677 return performSrlCombine(N, DCI);
5678 }
5679 case ISD::TRUNCATE:
5680 return performTruncateCombine(N, DCI);
5681 case ISD::MUL:
5682 return performMulCombine(N, DCI);
5683 case AMDGPUISD::MUL_U24:
5684 case AMDGPUISD::MUL_I24: {
5685 if (SDValue Simplified = simplifyMul24(N, DCI))
5686 return Simplified;
5687 break;
5688 }
5689 case AMDGPUISD::MULHI_I24:
5690 case AMDGPUISD::MULHI_U24:
5691 return simplifyMul24(N, DCI);
5692 case ISD::SMUL_LOHI:
5693 case ISD::UMUL_LOHI:
5694 return performMulLoHiCombine(N, DCI);
5695 case ISD::MULHS:
5696 return performMulhsCombine(N, DCI);
5697 case ISD::MULHU:
5698 return performMulhuCombine(N, DCI);
5699 case ISD::SELECT:
5700 return performSelectCombine(N, DCI);
5701 case ISD::FNEG:
5702 return performFNegCombine(N, DCI);
5703 case ISD::FABS:
5704 return performFAbsCombine(N, DCI);
5705 case AMDGPUISD::BFE_I32:
5706 case AMDGPUISD::BFE_U32: {
5707 assert(!N->getValueType(0).isVector() &&
5708 "Vector handling of BFE not implemented");
5709 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5710 if (!Width)
5711 break;
5712
5713 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5714 if (WidthVal == 0)
5715 return DAG.getConstant(0, DL, MVT::i32);
5716
5718 if (!Offset)
5719 break;
5720
5721 SDValue BitsFrom = N->getOperand(0);
5722 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5723
5724 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5725
5726 if (OffsetVal == 0) {
5727 // This is already sign / zero extended, so try to fold away extra BFEs.
5728 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5729
5730 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5731 if (OpSignBits >= SignBits)
5732 return BitsFrom;
5733
5734 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5735 if (Signed) {
5736 // This is a sign_extend_inreg. Replace it to take advantage of existing
5737 // DAG Combines. If not eliminated, we will match back to BFE during
5738 // selection.
5739
5740 // TODO: The sext_inreg of extended types ends, although we can could
5741 // handle them in a single BFE.
5742 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5743 DAG.getValueType(SmallVT));
5744 }
5745
5746 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5747 }
5748
5749 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5750 if (Signed) {
5751 return constantFoldBFE<int32_t>(DAG,
5752 CVal->getSExtValue(),
5753 OffsetVal,
5754 WidthVal,
5755 DL);
5756 }
5757
5758 return constantFoldBFE<uint32_t>(DAG,
5759 CVal->getZExtValue(),
5760 OffsetVal,
5761 WidthVal,
5762 DL);
5763 }
5764
5765 if ((OffsetVal + WidthVal) >= 32 &&
5766 !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
5767 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5768 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5769 BitsFrom, ShiftVal);
5770 }
5771
5772 if (BitsFrom.hasOneUse()) {
5773 APInt Demanded = APInt::getBitsSet(32,
5774 OffsetVal,
5775 OffsetVal + WidthVal);
5776
5777 KnownBits Known;
5779 !DCI.isBeforeLegalizeOps());
5780 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5781 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5782 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5783 DCI.CommitTargetLoweringOpt(TLO);
5784 }
5785 }
5786
5787 break;
5788 }
5789 case ISD::LOAD:
5790 return performLoadCombine(N, DCI);
5791 case ISD::STORE:
5792 return performStoreCombine(N, DCI);
5793 case AMDGPUISD::RCP:
5794 case AMDGPUISD::RCP_IFLAG:
5795 return performRcpCombine(N, DCI);
5796 case ISD::AssertZext:
5797 case ISD::AssertSext:
5798 return performAssertSZExtCombine(N, DCI);
5800 return performIntrinsicWOChainCombine(N, DCI);
5801 case AMDGPUISD::FMAD_FTZ: {
5802 SDValue N0 = N->getOperand(0);
5803 SDValue N1 = N->getOperand(1);
5804 SDValue N2 = N->getOperand(2);
5805 EVT VT = N->getValueType(0);
5806
5807 // FMAD_FTZ is a FMAD + flush denormals to zero.
5808 // We flush the inputs, the intermediate step, and the output.
5812 if (N0CFP && N1CFP && N2CFP) {
5813 const auto FTZ = [](const APFloat &V) {
5814 if (V.isDenormal()) {
5815 APFloat Zero(V.getSemantics(), 0);
5816 return V.isNegative() ? -Zero : Zero;
5817 }
5818 return V;
5819 };
5820
5821 APFloat V0 = FTZ(N0CFP->getValueAPF());
5822 APFloat V1 = FTZ(N1CFP->getValueAPF());
5823 APFloat V2 = FTZ(N2CFP->getValueAPF());
5825 V0 = FTZ(V0);
5827 return DAG.getConstantFP(FTZ(V0), DL, VT);
5828 }
5829 break;
5830 }
5831 }
5832 return SDValue();
5833}
5834
5836 SDValue Op, const APInt &OriginalDemandedBits,
5837 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
5838 unsigned Depth) const {
5839 switch (Op.getOpcode()) {
5841 switch (Op.getConstantOperandVal(0)) {
5842 case Intrinsic::amdgcn_readfirstlane:
5843 case Intrinsic::amdgcn_readlane:
5844 case Intrinsic::amdgcn_set_inactive:
5845 case Intrinsic::amdgcn_wwm: {
5846 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
5847 OriginalDemandedElts, Known, TLO, Depth + 1))
5848 return true;
5849 break;
5850 }
5851 default:
5852 break;
5853 }
5854 break;
5855 }
5856 default:
5857 break;
5858 }
5859
5860 return false;
5861}
5862
5863//===----------------------------------------------------------------------===//
5864// Helper functions
5865//===----------------------------------------------------------------------===//
5866
5868 const TargetRegisterClass *RC,
5869 Register Reg, EVT VT,
5870 const SDLoc &SL,
5871 bool RawReg) const {
5873 MachineRegisterInfo &MRI = MF.getRegInfo();
5874 Register VReg;
5875
5876 if (!MRI.isLiveIn(Reg)) {
5877 VReg = MRI.createVirtualRegister(RC);
5878 MRI.addLiveIn(Reg, VReg);
5879 } else {
5880 VReg = MRI.getLiveInVirtReg(Reg);
5881 }
5882
5883 if (RawReg)
5884 return DAG.getRegister(VReg, VT);
5885
5886 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5887}
5888
5889// This may be called multiple times, and nothing prevents creating multiple
5890// objects at the same offset. See if we already defined this object.
5892 int64_t Offset) {
5893 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5894 if (MFI.getObjectOffset(I) == Offset) {
5895 assert(MFI.getObjectSize(I) == Size);
5896 return I;
5897 }
5898 }
5899
5900 return MFI.CreateFixedObject(Size, Offset, true);
5901}
5902
5904 EVT VT,
5905 const SDLoc &SL,
5906 int64_t Offset) const {
5908 MachineFrameInfo &MFI = MF.getFrameInfo();
5909 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5910
5911 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5912 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5913
5914 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5917}
5918
5920 const SDLoc &SL,
5921 SDValue Chain,
5922 SDValue ArgVal,
5923 int64_t Offset) const {
5927
5928 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5929 // Stores to the argument stack area are relative to the stack pointer.
5930 SDValue SP =
5931 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5932 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5933 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5935 return Store;
5936}
5937
5939 const TargetRegisterClass *RC,
5940 EVT VT, const SDLoc &SL,
5941 const ArgDescriptor &Arg) const {
5942 assert(Arg && "Attempting to load missing argument");
5943
5944 SDValue V = Arg.isRegister() ?
5945 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5946 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5947
5948 if (!Arg.isMasked())
5949 return V;
5950
5951 unsigned Mask = Arg.getMask();
5952 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5953 V = DAG.getNode(ISD::SRL, SL, VT, V,
5954 DAG.getShiftAmountConstant(Shift, VT, SL));
5955 return DAG.getNode(ISD::AND, SL, VT, V,
5956 DAG.getConstant(Mask >> Shift, SL, VT));
5957}
5958
5960 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5961 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5962 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5963 uint64_t ArgOffset =
5964 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5965 switch (Param) {
5966 case FIRST_IMPLICIT:
5967 return ArgOffset;
5968 case PRIVATE_BASE:
5970 case SHARED_BASE:
5971 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5972 case QUEUE_PTR:
5973 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5974 }
5975 llvm_unreachable("unexpected implicit parameter type");
5976}
5977
5984
5986 SelectionDAG &DAG, int Enabled,
5987 int &RefinementSteps,
5988 bool &UseOneConstNR,
5989 bool Reciprocal) const {
5990 EVT VT = Operand.getValueType();
5991
5992 if (VT == MVT::f32) {
5993 RefinementSteps = 0;
5994 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5995 }
5996
5997 // TODO: There is also f64 rsq instruction, but the documentation is less
5998 // clear on its precision.
5999
6000 return SDValue();
6001}
6002
6004 SelectionDAG &DAG, int Enabled,
6005 int &RefinementSteps) const {
6006 EVT VT = Operand.getValueType();
6007
6008 if (VT == MVT::f32) {
6009 // Reciprocal, < 1 ulp error.
6010 //
6011 // This reciprocal approximation converges to < 0.5 ulp error with one
6012 // newton rhapson performed with two fused multiple adds (FMAs).
6013
6014 RefinementSteps = 0;
6015 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
6016 }
6017
6018 // TODO: There is also f64 rcp instruction, but the documentation is less
6019 // clear on its precision.
6020
6021 return SDValue();
6022}
6023
6024static unsigned workitemIntrinsicDim(unsigned ID) {
6025 switch (ID) {
6026 case Intrinsic::amdgcn_workitem_id_x:
6027 return 0;
6028 case Intrinsic::amdgcn_workitem_id_y:
6029 return 1;
6030 case Intrinsic::amdgcn_workitem_id_z:
6031 return 2;
6032 default:
6033 llvm_unreachable("not a workitem intrinsic");
6034 }
6035}
6036
6038 const SDValue Op, KnownBits &Known,
6039 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
6040
6041 Known.resetAll(); // Don't know anything.
6042
6043 unsigned Opc = Op.getOpcode();
6044
6045 switch (Opc) {
6046 default:
6047 break;
6048 case AMDGPUISD::CARRY:
6049 case AMDGPUISD::BORROW: {
6050 Known.Zero = APInt::getHighBitsSet(32, 31);
6051 break;
6052 }
6053
6054 case AMDGPUISD::BFE_I32:
6055 case AMDGPUISD::BFE_U32: {
6056 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6057 if (!CWidth)
6058 return;
6059
6060 uint32_t Width = CWidth->getZExtValue() & 0x1f;
6061
6062 if (Opc == AMDGPUISD::BFE_U32)
6063 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
6064
6065 break;
6066 }
6067 case AMDGPUISD::FP_TO_FP16: {
6068 unsigned BitWidth = Known.getBitWidth();
6069
6070 // High bits are zero.
6072 break;
6073 }
6074 case AMDGPUISD::MUL_U24:
6075 case AMDGPUISD::MUL_I24: {
6076 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6077 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6078 unsigned BitWidth = Op.getScalarValueSizeInBits();
6079
6080 // Sign/Zero extend from 24 bits.
6081 if (Opc == AMDGPUISD::MUL_I24) {
6082 LHSKnown = LHSKnown.trunc(24).sext(BitWidth);
6083 RHSKnown = RHSKnown.trunc(24).sext(BitWidth);
6084 } else {
6085 LHSKnown = LHSKnown.trunc(24).zext(BitWidth);
6086 RHSKnown = RHSKnown.trunc(24).zext(BitWidth);
6087 }
6088
6089 // TODO: SelfMultiply can be poison, but not undef.
6090 bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1);
6091 if (SelfMultiply)
6092 SelfMultiply &= DAG.isGuaranteedNotToBeUndefOrPoison(
6093 Op.getOperand(0), DemandedElts, UndefPoisonKind::UndefOrPoison,
6094 Depth + 1);
6095
6096 Known = KnownBits::mul(LHSKnown, RHSKnown, SelfMultiply);
6097 break;
6098 }
6099 case AMDGPUISD::PERM: {
6100 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6101 if (!CMask)
6102 return;
6103
6104 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6105 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6106 unsigned Sel = CMask->getZExtValue();
6107
6108 for (unsigned I = 0; I < 32; I += 8) {
6109 unsigned SelBits = Sel & 0xff;
6110 if (SelBits < 4) {
6111 SelBits *= 8;
6112 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
6113 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
6114 } else if (SelBits < 7) {
6115 SelBits = (SelBits & 3) * 8;
6116 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
6117 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
6118 } else if (SelBits == 0x0c) {
6119 Known.Zero |= 0xFFull << I;
6120 } else if (SelBits > 0x0c) {
6121 Known.One |= 0xFFull << I;
6122 }
6123 Sel >>= 8;
6124 }
6125 break;
6126 }
6127 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
6128 Known.Zero.setHighBits(24);
6129 break;
6130 }
6131 case AMDGPUISD::BUFFER_LOAD_USHORT: {
6132 Known.Zero.setHighBits(16);
6133 break;
6134 }
6135 case AMDGPUISD::LDS: {
6136 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
6137 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
6138
6139 Known.Zero.setHighBits(16);
6140 Known.Zero.setLowBits(Log2(Alignment));
6141 break;
6142 }
6143 case AMDGPUISD::SMIN3:
6144 case AMDGPUISD::SMAX3:
6145 case AMDGPUISD::SMED3:
6146 case AMDGPUISD::UMIN3:
6147 case AMDGPUISD::UMAX3:
6148 case AMDGPUISD::UMED3: {
6149 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
6150 if (Known2.isUnknown())
6151 break;
6152
6153 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6154 if (Known1.isUnknown())
6155 break;
6156
6157 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6158 if (Known0.isUnknown())
6159 break;
6160
6161 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
6162 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
6163 Known.One = Known0.One & Known1.One & Known2.One;
6164 break;
6165 }
6167 unsigned IID = Op.getConstantOperandVal(0);
6168 switch (IID) {
6169 case Intrinsic::amdgcn_workitem_id_x:
6170 case Intrinsic::amdgcn_workitem_id_y:
6171 case Intrinsic::amdgcn_workitem_id_z: {
6172 unsigned MaxValue = Subtarget->getMaxWorkitemID(
6174 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
6175 break;
6176 }
6177 default:
6178 break;
6179 }
6180 }
6181 }
6182}
6183
6185 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6186 unsigned Depth) const {
6187 switch (Op.getOpcode()) {
6188 case AMDGPUISD::BFE_I32: {
6189 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6190 if (!Width)
6191 return 1;
6192
6193 unsigned SignBits = 32 - (Width->getZExtValue() & 0x1f) + 1;
6194 if (!isNullConstant(Op.getOperand(1)))
6195 return SignBits;
6196
6197 // TODO: Could probably figure something out with non-0 offsets.
6198 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6199 return std::max(SignBits, Op0SignBits);
6200 }
6201
6202 case AMDGPUISD::BFE_U32: {
6203 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6204 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6205 }
6206
6207 case AMDGPUISD::CARRY:
6208 case AMDGPUISD::BORROW:
6209 return 31;
6210 case AMDGPUISD::BUFFER_LOAD_BYTE:
6211 return 25;
6212 case AMDGPUISD::BUFFER_LOAD_SHORT:
6213 return 17;
6214 case AMDGPUISD::BUFFER_LOAD_UBYTE:
6215 return 24;
6216 case AMDGPUISD::BUFFER_LOAD_USHORT:
6217 return 16;
6218 case AMDGPUISD::FP_TO_FP16:
6219 return 16;
6220 case AMDGPUISD::SMIN3:
6221 case AMDGPUISD::SMAX3:
6222 case AMDGPUISD::SMED3:
6223 case AMDGPUISD::UMIN3:
6224 case AMDGPUISD::UMAX3:
6225 case AMDGPUISD::UMED3: {
6226 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
6227 if (Tmp2 == 1)
6228 return 1; // Early out.
6229
6230 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
6231 if (Tmp1 == 1)
6232 return 1; // Early out.
6233
6234 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6235 if (Tmp0 == 1)
6236 return 1; // Early out.
6237
6238 return std::min({Tmp0, Tmp1, Tmp2});
6239 }
6240 default:
6241 return 1;
6242 }
6243}
6244
6246 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6247 const MachineRegisterInfo &MRI, unsigned Depth) const {
6248 const MachineInstr *MI = MRI.getVRegDef(R);
6249 if (!MI)
6250 return 1;
6251
6252 // TODO: Check range metadata on MMO.
6253 switch (MI->getOpcode()) {
6254 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6255 return 25;
6256 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6257 return 17;
6258 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6259 return 24;
6260 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6261 return 16;
6262 case AMDGPU::G_AMDGPU_SMED3:
6263 case AMDGPU::G_AMDGPU_UMED3: {
6264 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6265 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
6266 if (Tmp2 == 1)
6267 return 1;
6268 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
6269 if (Tmp1 == 1)
6270 return 1;
6271 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
6272 if (Tmp0 == 1)
6273 return 1;
6274 return std::min({Tmp0, Tmp1, Tmp2});
6275 }
6276 default:
6277 return 1;
6278 }
6279}
6280
6282 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6283 UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const {
6284 unsigned Opcode = Op.getOpcode();
6285 switch (Opcode) {
6286 case AMDGPUISD::BFE_I32:
6287 case AMDGPUISD::BFE_U32:
6288 return false;
6289 }
6291 Op, DemandedElts, DAG, Kind, ConsiderFlags, Depth);
6292}
6293
6295 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6296 unsigned Depth) const {
6297 unsigned Opcode = Op.getOpcode();
6298 switch (Opcode) {
6299 case AMDGPUISD::FMIN_LEGACY:
6300 case AMDGPUISD::FMAX_LEGACY: {
6301 if (SNaN)
6302 return true;
6303
6304 // TODO: Can check no nans on one of the operands for each one, but which
6305 // one?
6306 return false;
6307 }
6308 case AMDGPUISD::FMUL_LEGACY:
6309 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6310 if (SNaN)
6311 return true;
6312 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6313 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6314 }
6315 case AMDGPUISD::FMED3:
6316 case AMDGPUISD::FMIN3:
6317 case AMDGPUISD::FMAX3:
6318 case AMDGPUISD::FMINIMUM3:
6319 case AMDGPUISD::FMAXIMUM3:
6320 case AMDGPUISD::FMAD_FTZ: {
6321 if (SNaN)
6322 return true;
6323 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6324 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6325 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6326 }
6327 case AMDGPUISD::CVT_F32_UBYTE0:
6328 case AMDGPUISD::CVT_F32_UBYTE1:
6329 case AMDGPUISD::CVT_F32_UBYTE2:
6330 case AMDGPUISD::CVT_F32_UBYTE3:
6331 return true;
6332
6333 case AMDGPUISD::RCP:
6334 case AMDGPUISD::RSQ:
6335 case AMDGPUISD::RCP_LEGACY:
6336 case AMDGPUISD::RSQ_CLAMP: {
6337 if (SNaN)
6338 return true;
6339
6340 // TODO: Need is known positive check.
6341 return false;
6342 }
6343 case ISD::FLDEXP:
6344 case AMDGPUISD::FRACT: {
6345 if (SNaN)
6346 return true;
6347 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6348 }
6349 case AMDGPUISD::DIV_SCALE:
6350 case AMDGPUISD::DIV_FMAS:
6351 case AMDGPUISD::DIV_FIXUP:
6352 // TODO: Refine on operands.
6353 return SNaN;
6354 case AMDGPUISD::SIN_HW:
6355 case AMDGPUISD::COS_HW: {
6356 // TODO: Need check for infinity
6357 return SNaN;
6358 }
6360 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6361 // TODO: Handle more intrinsics
6362 switch (IntrinsicID) {
6363 case Intrinsic::amdgcn_cubeid:
6364 case Intrinsic::amdgcn_cvt_off_f32_i4:
6365 return true;
6366
6367 case Intrinsic::amdgcn_frexp_mant: {
6368 if (SNaN)
6369 return true;
6370 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6371 }
6372 case Intrinsic::amdgcn_cvt_pkrtz: {
6373 if (SNaN)
6374 return true;
6375 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6376 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6377 }
6378 case Intrinsic::amdgcn_rcp:
6379 case Intrinsic::amdgcn_rsq:
6380 case Intrinsic::amdgcn_rcp_legacy:
6381 case Intrinsic::amdgcn_rsq_legacy:
6382 case Intrinsic::amdgcn_rsq_clamp:
6383 case Intrinsic::amdgcn_tanh: {
6384 if (SNaN)
6385 return true;
6386
6387 // TODO: Need is known positive check.
6388 return false;
6389 }
6390 case Intrinsic::amdgcn_trig_preop:
6391 case Intrinsic::amdgcn_fdot2:
6392 // TODO: Refine on operand
6393 return SNaN;
6394 case Intrinsic::amdgcn_fma_legacy:
6395 if (SNaN)
6396 return true;
6397 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6398 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6399 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6400 default:
6401 return false;
6402 }
6403 }
6404 default:
6405 return false;
6406 }
6407}
6408
6410 Register N0, Register N1) const {
6411 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6412}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:317
#define LLVM_READONLY
Definition Compiler.h:324
Provides analysis for querying information about KnownBits during GISel passes.
const HexagonInstrInfo * TII
static MaybeAlign getAlign(Value *Ptr)
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
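If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.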
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue lowerFEXPF64(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
bool isInt64ImmLegal(SDNode *Val, SelectionDAG &DAG) const
Check whether value Val can be supported by v_mov_b64, for the current target.
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue LowerCTLS(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP16(SDValue Op, SelectionDAG &DAG, EVT FP16Ty) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand of vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:297
static const fltSemantics & IEEEdouble()
Definition APFloat.h:298
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:345
static const fltSemantics & IEEEhalf()
Definition APFloat.h:295
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1509
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1246
const fltSemantics & getSemantics() const
Definition APFloat.h:1552
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1264
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1223
APInt bitcastToAPInt() const
Definition APFloat.h:1436
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1163
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1414
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
bool ugt(const APInt &RHS) const
Unsigned greater than comparison.
Definition APInt.h:1189
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1157
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1411
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
const BlockAddress * getBlockAddress() const
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:357
iterator_range< arg_iterator > args()
Definition Function.h:866
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:353
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
void addLiveIn(MCRegister Reg, Register vreg=Register())
addLiveIn - Add the specified register as a live-in.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, UndefPoisonKind Kind=UndefPoisonKind::UndefOrPoison, unsigned Depth=0) const
Return true if this function can prove that Op is never poison, and Kind can be used to track poison ...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:827
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:787
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:861
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:796
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:804
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:800
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ STRICT_FP16_TO_FP
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:819
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:896
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:813
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:934
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:795
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:953
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:573
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
UndefPoisonKind
Enumeration to track whether we are interested in Undef, Poison, or both.
Definition UndefPoison.h:20
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1688
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:418
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:508
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:494
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:266
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:453
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition ValueTypes.h:501
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:435
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:404
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:442
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
bool bitsGE(EVT VT) const
Return true if this has no fewer bits than VT.
Definition ValueTypes.h:315
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:150
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:64
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:165
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:176
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:184
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:130
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:103
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...