LLVM 23.0.0git
AMDGPUISelLowering.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
39
40// Find a larger type to do a load / store of a vector with.
// NOTE(review): the declaration line (orig. 41) was dropped by extraction —
// confirm signature upstream. Visible body: sizes <= 32 bits map to a single
// iN; exact multiples of 32 map to <N x i32>; otherwise VT is unchanged.
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
51
55
// NOTE(review): the enclosing function's declaration (orig. ~56) is missing
// from this view; only the tail survives. It forwards to the DAG's
// significant-bits analysis.
57 // In order for this to be a signed 24-bit value, bit 23, must
58 // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
61
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather then generating calls to memset, mempcy or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
172 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
173
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
176
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
179
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
185 for (MVT VT : MVT::integer_valuetypes())
187 Expand);
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(Op, VT, MVT::i1, Promote);
195 setLoadExtAction(Op, VT, MVT::i8, Legal);
196 setLoadExtAction(Op, VT, MVT::i16, Legal);
197 setLoadExtAction(Op, VT, MVT::i32, Expand);
198 }
199 }
200
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
205 Expand);
206
207 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
221
222 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
227 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
228
229 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
241
243 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
244
246 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
247
249 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
250
252 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
253
255 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
256
258 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
259
261 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
262
264 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
265
267 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
268
270 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
271
273 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
274
276 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
277
279 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
280
282 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
283
285 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
286
288 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
289
291 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
292
294 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
295
297 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
298
300 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
301
303 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
304
306 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
307
309 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
310
312 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
313
315 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
316
318 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
319
321 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
322
323 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
325 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
326 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
327
328 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
330 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
331 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
332
333 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
334 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
335 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
336 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
337 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
338 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
339 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
340 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
341 setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
342 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
343 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
344 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
345 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
346 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
347 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
348
349 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
350 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
351 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
352
353 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
354 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
355 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
356
357 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
358
359 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
360 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
361 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
362 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
363 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
364 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
365 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
366
367 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
368 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
369 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
370 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
371 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
372
373 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
374 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
375 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
376
377 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
378 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
379 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
380
381 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
382 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
383 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
384
385 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
386 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
387 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
388
389 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
390 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
391 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
392 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
393 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
394 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
395 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
396
397 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
398 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
399
401
402 // For R600, this is totally unsupported, just custom lower to produce an
403 // error.
405
406 // Library functions. These default to Expand, but we have instructions
407 // for them.
410 {MVT::f16, MVT::f32}, Legal);
412
414 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
416 {MVT::f16, MVT::f32, MVT::f64}, Expand);
417
420 Custom);
422
423 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
424
425 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
426
427 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
428 Expand);
429
430 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
431 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
433
435 Custom);
436
437 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
438
439 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
440 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
441 // default unless marked custom/legal.
443 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
444 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
445 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
446 MVT::v16f64},
447 Custom);
448
449 // Expand to fneg + fadd.
451
453 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
454 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
455 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
456 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
457 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
458 Custom);
459
462 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
463 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
464 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
465 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
466 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
467 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
468 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
469 Custom);
470
472 Expand);
473 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
474
475 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
476 for (MVT VT : ScalarIntVTs) {
477 // These should use [SU]DIVREM, so set them to expand
479 Expand);
480
481 // GPU does not have divrem function for signed or unsigned.
483
484 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
486
488
489 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
491 }
492
493 // The hardware supports 32-bit FSHR, but not FSHL.
495
496 setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
497
499
504 MVT::i64, Custom);
506
508 Legal);
509
512 MVT::i64, Custom);
513
514 for (auto VT : {MVT::i8, MVT::i16})
516
517 static const MVT::SimpleValueType VectorIntTypes[] = {
518 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
519 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
520
521 for (MVT VT : VectorIntTypes) {
522 // Expand the following operations for the current type by default.
523 // clang-format off
543 VT, Expand);
544 // clang-format on
545 }
546
547 static const MVT::SimpleValueType FloatVectorTypes[] = {
548 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
549 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
550
551 for (MVT VT : FloatVectorTypes) {
564 VT, Expand);
565 }
566
567 // This causes using an unrolled select operation rather than expansion with
568 // bit operations. This is in general better, but the alternative using BFI
569 // instructions may be better if the select sources are SGPRs.
571 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
572
574 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
575
577 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
578
580 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
581
583 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
584
586 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
587
589 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
590
592 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
593
595 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
596
598 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
599
601 setJumpIsExpensive(true);
602
605
607
608 // We want to find all load dependencies for long chains of stores to enable
609 // merging into very wide vectors. The problem is with vectors with > 4
610 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
611 // vectors are a legal type, even though we have to split the loads
612 // usually. When we can more precisely specify load legality per address
613 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
614 // smarter so that they can figure out what to do in 2 iterations without all
615 // N > 4 stores on the same chain.
617
618 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
619 // about these during lowering.
620 MaxStoresPerMemcpy = 0xffffffff;
621 MaxStoresPerMemmove = 0xffffffff;
622 MaxStoresPerMemset = 0xffffffff;
623
624 // The expansion for 64-bit division is enormous.
626 addBypassSlowDiv(64, 32);
627
638
642}
643
645 const auto Flags = Op.getNode()->getFlags();
646 if (Flags.hasNoSignedZeros())
647 return true;
648
649 return false;
650}
651
652//===----------------------------------------------------------------------===//
653// Target Information
654//===----------------------------------------------------------------------===//
655
// Per the cases below: opcodes whose operation can absorb an fneg (FP
// arithmetic, min/max variants, and AMDGPU hardware ops).
// NOTE(review): extraction dropped orig. lines 666-667 and 678 inside the
// case list (embedded numbering skips) — verify the full list upstream.
657static bool fnegFoldsIntoOpcode(unsigned Opc) {
658 switch (Opc) {
659 case ISD::FADD:
660 case ISD::FSUB:
661 case ISD::FMUL:
662 case ISD::FMA:
663 case ISD::FMAD:
664 case ISD::FMINNUM:
665 case ISD::FMAXNUM:
668 case ISD::FMINIMUM:
669 case ISD::FMAXIMUM:
670 case ISD::FMINIMUMNUM:
671 case ISD::FMAXIMUMNUM:
672 case ISD::SELECT:
673 case ISD::FSIN:
674 case ISD::FTRUNC:
675 case ISD::FRINT:
676 case ISD::FNEARBYINT:
677 case ISD::FROUNDEVEN:
679 case AMDGPUISD::RCP:
680 case AMDGPUISD::RCP_LEGACY:
681 case AMDGPUISD::RCP_IFLAG:
682 case AMDGPUISD::SIN_HW:
683 case AMDGPUISD::FMUL_LEGACY:
684 case AMDGPUISD::FMIN_LEGACY:
685 case AMDGPUISD::FMAX_LEGACY:
686 case AMDGPUISD::FMED3:
687 // TODO: handle llvm.amdgcn.fma.legacy
688 return true;
689 case ISD::BITCAST:
690 llvm_unreachable("bitcast is special cased");
691 default:
692 return false;
693 }
694}
695
696static bool fnegFoldsIntoOp(const SDNode *N) {
697 unsigned Opc = N->getOpcode();
698 if (Opc == ISD::BITCAST) {
699 // TODO: Is there a benefit to checking the conditions performFNegCombine
700 // does? We don't for the other cases.
701 SDValue BCSrc = N->getOperand(0);
702 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
703 return BCSrc.getNumOperands() == 2 &&
704 BCSrc.getOperand(1).getValueSizeInBits() == 32;
705 }
706
707 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
708 }
709
710 return fnegFoldsIntoOpcode(Opc);
711}
712
713/// \p returns true if the operation will definitely need to use a 64-bit
714/// encoding, and thus will use a VOP3 encoding regardless of the source
715/// modifiers.
717static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
718 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
719 VT == MVT::f64;
720}
721
722/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
723/// type for ISD::SELECT.
725static bool selectSupportsSourceMods(const SDNode *N) {
726 // TODO: Only applies if select will be vector
727 return N->getValueType(0) == MVT::f32;
728}
729
730// Most FP instructions support source modifiers, but this could be refined
731// slightly.
// Conservatively decides whether a node's operands may carry fabs/fneg
// source modifiers: memory nodes and the listed opcodes/intrinsics cannot;
// everything else defaults to true.
// NOTE(review): extraction dropped orig. 742 and 744 (the case labels at
// 738-743 presumably share an elided `return false;`), orig. 751 (likely the
// `case ISD::INTRINSIC_WO_CHAIN:` opening the inner switch), and orig. 764
// (the SELECT result, likely delegating to selectSupportsSourceMods) —
// verify upstream.
733static bool hasSourceMods(const SDNode *N) {
734 if (isa<MemSDNode>(N))
735 return false;
736
737 switch (N->getOpcode()) {
738 case ISD::CopyToReg:
739 case ISD::FDIV:
740 case ISD::FREM:
741 case ISD::INLINEASM:
743 case AMDGPUISD::DIV_SCALE:
745
746 // TODO: Should really be looking at the users of the bitcast. These are
747 // problematic because bitcasts are used to legalize all stores to integer
748 // types.
749 case ISD::BITCAST:
750 return false;
752 switch (N->getConstantOperandVal(0)) {
753 case Intrinsic::amdgcn_interp_p1:
754 case Intrinsic::amdgcn_interp_p2:
755 case Intrinsic::amdgcn_interp_mov:
756 case Intrinsic::amdgcn_interp_p1_f16:
757 case Intrinsic::amdgcn_interp_p2_f16:
758 return false;
759 default:
760 return true;
761 }
762 }
763 case ISD::SELECT:
765 default:
766 return true;
767 }
768}
769
// NOTE(review): the opening declaration line (orig. 770) is missing —
// presumably `static bool allUsesHaveSourceMods(const SDNode *N,` with the
// visible `unsigned CostThreshold` tail; confirm upstream.
771 unsigned CostThreshold) {
772 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
773 // it is truly free to use a source modifier in all cases. If there are
774 // multiple users but for each one will necessitate using VOP3, there will be
775 // a code size increase. Try to avoid increasing code size unless we know it
776 // will save on the instruction count.
777 unsigned NumMayIncreaseSize = 0;
778 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
779
780 assert(!N->use_empty());
781
782 // XXX - Should this limit number of uses to check?
783 for (const SDNode *U : N->users()) {
784 if (!hasSourceMods(U))
785 return false;
786
787 if (!opMustUseVOP3Encoding(U, VT)) {
788 // This user could use the short encoding; folding a modifier in would
789 // force VOP3 and grow code size, so count it against the threshold.
790 if (++NumMayIncreaseSize > CostThreshold)
791 return false;
792 }
793 }
794
795 return true;
796}
795
// NOTE(review): the declaration's first line (orig. 796) is missing; the
// visible tail is the trailing `ISD::NodeType ExtendKind) const` parameter.
// Scalar return values are widened to the next multiple of 32 bits.
797 ISD::NodeType ExtendKind) const {
798 assert(!VT.isVector() && "only scalar expected");
799
800 // Round to the next multiple of 32-bits.
801 unsigned Size = VT.getSizeInBits();
802 if (Size <= 32)
803 return MVT::i32;
804 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
805}
806
808 return 32;
809}
810
812 return true;
813}
814
815// The backend supports 32 and 64 bit floating point immediates.
816// FIXME: Why are we reporting vectors of FP immediates as legal?
818 bool ForCodeSize) const {
819 return isTypeLegal(VT.getScalarType());
820}
821
822// We don't want to shrink f64 / f32 constants.
824 EVT ScalarVT = VT.getScalarType();
825 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
826}
827
// NOTE(review): extraction dropped the declaration's first line (orig. 828),
// orig. 845 (presumably the MemSDNode cast defining `MN`), and orig.
// 851-853/855 (the middle of the scalar-load condition) — verify upstream.
829 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
830 std::optional<unsigned> ByteOffset) const {
831 // TODO: This may be worth removing. Check regression tests for diffs.
832 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
833 return false;
834
835 unsigned NewSize = NewVT.getStoreSizeInBits();
836
837 // If we are reducing to a 32-bit load or a smaller multi-dword load,
838 // this is always better.
839 if (NewSize >= 32)
840 return true;
841
842 EVT OldVT = N->getValueType(0);
843 unsigned OldSize = OldVT.getStoreSizeInBits();
844
846 unsigned AS = MN->getAddressSpace();
847 // Do not shrink an aligned scalar load to sub-dword.
848 // Scalar engine cannot do sub-dword loads.
849 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
850 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
854 MN->isInvariant())) &&
856 return false;
857
858 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
859 // extloads, so doing one requires using a buffer_load. In cases where we
860 // still couldn't use a scalar load, using the wider load shouldn't really
861 // hurt anything.
862
863 // If the old size already had to be an extload, there's no harm in continuing
864 // to reduce the width.
865 return (OldSize < 32);
866}
867
// NOTE(review): the declaration's first line (orig. 868) and orig. 884 (the
// start of the final call — presumably allowsMisalignedMemoryAccesses, given
// the visible `CastTy, MMO, &Fast` continuation) were dropped by extraction.
869 const SelectionDAG &DAG,
870 const MachineMemOperand &MMO) const {
871
872 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
873
874 if (LoadTy.getScalarType() == MVT::i32)
875 return false;
876
877 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
878 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
879
880 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
881 return false;
882
883 unsigned Fast = 0;
885 CastTy, MMO, &Fast) &&
886 Fast;
887}
888
889// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
890// profitable with the expansion for 64-bit since it's generally good to
891// speculate things.
893 return true;
894}
895
897 return true;
898}
899
// NOTE(review): the declaration (orig. 900) and several interior lines
// (orig. 905, 907, 909, 911, 915) are missing — the intrinsic cases' case
// labels/returns and the CONSTANT_ADDRESS comparison constant were elided;
// verify upstream before relying on the control flow shown here.
901 switch (N->getOpcode()) {
902 case ISD::EntryToken:
903 case ISD::TokenFactor:
904 return true;
906 unsigned IntrID = N->getConstantOperandVal(0);
908 }
910 unsigned IntrID = N->getConstantOperandVal(1);
912 }
913 case ISD::LOAD:
914 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
916 return true;
917 return false;
918 case AMDGPUISD::SETCC: // ballot-style instruction
919 return true;
920 }
921 return false;
922}
923
// NOTE(review): the declaration's first line (orig. 924) is missing; the
// parameter tail matches TargetLowering::getNegatedExpression's signature.
// FMA/FMAD negation is only free when every user accepts source modifiers;
// RCP recurses into its source; everything else defers to the base class.
925 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
926 NegatibleCost &Cost, unsigned Depth) const {
927
928 switch (Op.getOpcode()) {
929 case ISD::FMA:
930 case ISD::FMAD: {
931 // Negating a fma is not free if it has users without source mods.
932 if (!allUsesHaveSourceMods(Op.getNode()))
933 return SDValue();
934 break;
935 }
936 case AMDGPUISD::RCP: {
937 SDValue Src = Op.getOperand(0);
938 EVT VT = Op.getValueType();
939 SDLoc SL(Op);
940
941 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
942 ForCodeSize, Cost, Depth + 1);
943 if (NegSrc)
944 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
945 return SDValue();
946 }
947 default:
948 break;
949 }
950
951 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
952 ForCodeSize, Cost, Depth);
953}
954
955//===---------------------------------------------------------------------===//
956// Target Properties
957//===---------------------------------------------------------------------===//
958
961
962 // Packed operations do not have a fabs modifier.
963 // Report this based on the end legalized type.
964 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
965}
966
969 // Report this based on the end legalized type.
970 VT = VT.getScalarType();
971 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
972}
973
975 unsigned NumElem,
976 unsigned AS) const {
977 return true;
978}
979
981 // There are few operations which truly have vector input operands. Any vector
982 // operation is going to involve operations on each component, and a
983 // build_vector will be a copy per element, so it always makes sense to use a
984 // build_vector input in place of the extracted element to avoid a copy into a
985 // super register.
986 //
987 // We should probably only do this if all users are extracts only, but this
988 // should be the common case.
989 return true;
990}
991
// NOTE(review): the declaration (orig. 992) is missing — this is the EVT
// overload, judging by value-semantics `Source`/`Dest` with getSizeInBits().
993 // Truncate is just accessing a subregister.
994
995 unsigned SrcSize = Source.getSizeInBits();
996 unsigned DestSize = Dest.getSizeInBits();
997
998 return DestSize < SrcSize && DestSize % 32 == 0 ;
999}
1000
// NOTE(review): the declaration (orig. 1001) is missing — pointer-dereference
// `Source->`/`Dest->` suggests the Type* overload; confirm upstream.
1002 // Truncate is just accessing a subregister.
1003
1004 unsigned SrcSize = Source->getScalarSizeInBits();
1005 unsigned DestSize = Dest->getScalarSizeInBits();
1006
// Truncating to 16 bits is free when the subtarget has 16-bit instructions
// and the source is at least a full 32-bit register.
1007 if (DestSize== 16 && Subtarget->has16BitInsts())
1008 return SrcSize >= 32;
1009
1010 return DestSize < SrcSize && DestSize % 32 == 0;
1011}
1012
// NOTE(review): the declaration (orig. 1013) is missing — this is the
// Type*-based isZExtFree overload, by the `Src->`/`Dest->` accessors.
1014 unsigned SrcSize = Src->getScalarSizeInBits();
1015 unsigned DestSize = Dest->getScalarSizeInBits();
1016
// i16 -> 32-bit-or-wider extension is free with 16-bit instructions.
1017 if (SrcSize == 16 && Subtarget->has16BitInsts())
1018 return DestSize >= 32;
1019
1020 return SrcSize == 32 && DestSize == 64;
1021}
1022
// NOTE(review): the declaration (orig. 1023) is missing — this is the
// EVT/MVT-based isZExtFree overload, by the `Src == MVT::i16` comparisons.
1024 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1025 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1026 // this will enable reducing 64-bit operations the 32-bit, which is always
1027 // good.
1028
1029 if (Src == MVT::i16)
1030 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1031
1032 return Src == MVT::i32 && Dest == MVT::i64;
1033}
1034
// NOTE(review): the declaration's first line (orig. 1035) is missing; the
// visible tail is `EVT DestVT) const`, and the body reads `SrcVT`/`DestVT`.
1036 EVT DestVT) const {
1037 switch (N->getOpcode()) {
1038 case ISD::ADD:
1039 case ISD::SUB:
1040 case ISD::SHL:
1041 case ISD::SRL:
1042 case ISD::SRA:
1043 case ISD::AND:
1044 case ISD::OR:
1045 case ISD::XOR:
1046 case ISD::MUL:
1047 case ISD::SETCC:
1048 case ISD::SELECT:
1049 case ISD::SMIN:
1050 case ISD::SMAX:
1051 case ISD::UMIN:
1052 case ISD::UMAX:
1053 if (isTypeLegal(MVT::i16) &&
1054 (!DestVT.isVector() ||
1055 !isOperationLegal(ISD::ADD, MVT::v2i16))) { // Check if VOP3P
1056 // Don't narrow back down to i16 if promoted to i32 already.
1057 if (!N->isDivergent() && DestVT.isInteger() &&
1058 DestVT.getScalarSizeInBits() > 1 &&
1059 DestVT.getScalarSizeInBits() <= 16 &&
1060 SrcVT.getScalarSizeInBits() > 16) {
1061 return false;
1062 }
1063 }
1064 return true;
1065 default:
1066 break;
1067 }
1068
1069 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1070 // limited number of native 64-bit operations. Shrinking an operation to fit
1071 // in a single 32-bit register should always be helpful. As currently used,
1072 // this is much less general than the name suggests, and is only used in
1073 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1074 // not profitable, and may actually be harmful.
1075 if (isa<LoadSDNode>(N))
1076 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1077
1078 return true;
1079}
1080
// NOTE(review): the declaration's first line (orig. 1081) and orig. 1097
// (the first half of the commute-early condition — presumably a CombineLevel
// comparison, given the trailing `||` clause at 1098) were dropped by
// extraction; verify the exact predicate upstream.
1082 const SDNode* N, CombineLevel Level) const {
1083 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1084 N->getOpcode() == ISD::SRL) &&
1085 "Expected shift op");
1086
1087 SDValue ShiftLHS = N->getOperand(0);
1088 if (!ShiftLHS->hasOneUse())
1089 return false;
1090
1091 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1092 !ShiftLHS.getOperand(0)->hasOneUse())
1093 return false;
1094
1095 // Always commute pre-type legalization and right shifts.
1096 // We're looking for shl(or(x,y),z) patterns.
1098 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1099 return true;
1100
1101 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1102 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1103 (N->user_begin()->getOpcode() == ISD::SRA ||
1104 N->user_begin()->getOpcode() == ISD::SRL))
1105 return false;
1106
1107 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1108 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1109 if (LHS.getOpcode() != ISD::SHL)
1110 return false;
1111 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1112 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1113 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1114 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1115 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1116 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1117 };
1118 SDValue LHS = N->getOperand(0).getOperand(0);
1119 SDValue RHS = N->getOperand(0).getOperand(1);
1120 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1121}
1122
1123//===---------------------------------------------------------------------===//
1124// TargetLowering Callbacks
1125//===---------------------------------------------------------------------===//
1126
// Maps a calling convention to the CC_* assignment function used for calls.
// NOTE(review): extraction dropped the declaration's first line (orig. 1127)
// and the case labels at orig. 1130-1136, 1138-1139, 1145-1146, 1148-1149 —
// the conventions selecting CC_AMDGPU, CC_AMDGPU_CS_CHAIN, and CC_SI_Gfx are
// not visible here; verify the full switch upstream.
1128 bool IsVarArg) {
1129 switch (CC) {
1137 return CC_AMDGPU;
1140 return CC_AMDGPU_CS_CHAIN;
1141 case CallingConv::C:
1142 case CallingConv::Fast:
1143 case CallingConv::Cold:
1144 return CC_AMDGPU_Func;
1147 return CC_SI_Gfx;
1150 default:
1151 reportFatalUsageError("unsupported calling convention for call");
1152 }
1153}
1154
// Map a calling convention to the CCAssignFn used for *return values*.
// Kernels never flow through here (see the unreachable below).
// NOTE(review): several `case` labels were elided in this extracted view;
// confirm which conventions map to each visible `return` against the full
// source.
1156 bool IsVarArg) {
1157 switch (CC) {
1160 llvm_unreachable("kernels should not be handled here");
1170 return RetCC_SI_Shader;
1173 return RetCC_SI_Gfx;
1174 case CallingConv::C:
1175 case CallingConv::Fast:
1176 case CallingConv::Cold:
1177 return RetCC_AMDGPU_Func;
1178 default:
1179 reportFatalUsageError("unsupported calling convention");
1180 }
1181}
1182
1183/// The SelectionDAGBuilder will automatically promote function arguments
1184/// with illegal types. However, this does not work for the AMDGPU targets
1185/// since the function arguments are stored in memory as these illegal types.
1186/// In order to handle this properly we need to get the original type sizes
1187/// from the LLVM IR Function and fix up the ISD::InputArg values before
1188/// passing them to AnalyzeFormalArguments()
1189
1190/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1191/// input values across multiple registers. Each item in the Ins array
1192/// represents a single value that will be stored in registers. Ins[x].VT is
1193/// the value type of the value that will be stored in the register, so
1194/// whatever SDNode we lower the argument to needs to be this type.
1195///
1196/// In order to correctly lower the arguments we need to know the size of each
1197/// argument. Since Ins[x].VT gives us the size of the register that will
1198/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1199/// for the original function argument so that we can deduce the correct memory
1200/// type to use for Ins[x]. In most cases the correct memory type will be
1201/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1202/// we have a kernel argument of type v8i8, this argument will be split into
1203/// 8 parts and each part will be represented by its own item in the Ins array.
1204/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1205/// the argument before it was split. From this, we deduce that the memory type
1206/// for each individual part is i8. We pass the memory type as LocVT to the
1207/// calling convention analysis function and the register type (Ins[x].VT) as
1208/// the ValVT.
// Assign argument locations for compute entry points directly from the IR
// signature: walk Fn.args(), compute each argument's in-memory offset in the
// kernarg segment (honoring byref and ABI alignment), deduce the memory type
// of each register-sized piece, and record one custom-mem CCValAssign per
// register. See the long comment block above for the MemVT/RegisterVT
// rationale.
// NOTE(review): a few lines were elided in this extracted view (e.g. a local
// around original line 1216 that supplies `CC`, the Offsets vector at 1244,
// and the CCValAssign flag argument at 1316) — verify against the full
// source.
1210 CCState &State,
1211 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1212 const MachineFunction &MF = State.getMachineFunction();
1213 const Function &Fn = MF.getFunction();
1214 LLVMContext &Ctx = Fn.getContext();
1215 const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
1217
1218 Align MaxAlign = Align(1);
1219 uint64_t ExplicitArgOffset = 0;
1220 const DataLayout &DL = Fn.getDataLayout();
1221
1222 unsigned InIndex = 0;
1223
1224 for (const Argument &Arg : Fn.args()) {
1225 const bool IsByRef = Arg.hasByRefAttr();
1226 Type *BaseArgTy = Arg.getType();
      // For byref kernel args the pointee type, not the pointer, occupies
      // kernarg memory.
1227 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1228 Align Alignment = DL.getValueOrABITypeAlignment(
1229 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1230 MaxAlign = std::max(Alignment, MaxAlign);
1231 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1232
      // ArgOffset is where this argument starts in the kernarg segment;
      // ExplicitArgOffset advances past it for the next iteration.
1233 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1234 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1236 // We're basically throwing away everything passed into us and starting over
1237 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1238 // to us as computed in Ins.
1239 //
1240 // We also need to figure out what type legalization is trying to do to get
1241 // the correct memory offsets.
1242
1243 SmallVector<EVT, 16> ValueVTs;
1245 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1246 &Offsets, ArgOffset);
1247
1248 for (unsigned Value = 0, NumValues = ValueVTs.size();
1249 Value != NumValues; ++Value) {
1250 uint64_t BasePartOffset = Offsets[Value];
1251
1252 EVT ArgVT = ValueVTs[Value];
1253 EVT MemVT = ArgVT;
1254 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1255 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1256
1257 if (NumRegs == 1) {
1258 // This argument is not split, so the IR type is the memory type.
1259 if (ArgVT.isExtended()) {
1260 // We have an extended type, like i24, so we should just use the
1261 // register type.
1262 MemVT = RegisterVT;
1263 } else {
1264 MemVT = ArgVT;
1265 }
1266 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1267 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1268 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1269 // We have a vector value which has been split into a vector with
1270 // the same scalar type, but fewer elements. This should handle
1271 // all the floating-point vector types.
1272 MemVT = RegisterVT;
1273 } else if (ArgVT.isVector() &&
1274 ArgVT.getVectorNumElements() == NumRegs) {
1275 // This arg has been split so that each element is stored in a separate
1276 // register.
1277 MemVT = ArgVT.getScalarType();
1278 } else if (ArgVT.isExtended()) {
1279 // We have an extended type, like i65.
1280 MemVT = RegisterVT;
1281 } else {
1282 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1283 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1284 if (RegisterVT.isInteger()) {
1285 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1286 } else if (RegisterVT.isVector()) {
1287 assert(!RegisterVT.getScalarType().isFloatingPoint());
1288 unsigned NumElements = RegisterVT.getVectorNumElements();
1289 assert(MemoryBits % NumElements == 0);
1290 // This vector type has been split into another vector type with
1291 // a different elements size.
1292 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1293 MemoryBits / NumElements);
1294 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1295 } else {
1296 llvm_unreachable("cannot deduce memory type.");
1297 }
1298 }
1299
1300 // Convert one element vectors to scalar.
1301 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1302 MemVT = MemVT.getScalarType();
1303
1304 // Round up vec3/vec5 argument.
1305 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1306 MemVT = MemVT.getPow2VectorType(State.getContext());
1307 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1308 MemVT = MemVT.getRoundIntegerType(State.getContext());
1309 }
1310
      // One location per register piece; PartOffset walks through the
      // argument's in-memory bytes.
1311 unsigned PartOffset = 0;
1312 for (unsigned i = 0; i != NumRegs; ++i) {
1313 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1314 BasePartOffset + PartOffset,
1315 MemVT.getSimpleVT(),
1317 PartOffset += MemVT.getStoreSize();
1318 }
1319 }
1320 }
1321}
1322
// Lower a return by terminating the wave: emits a chained ENDPGM node.
// Return values are intentionally ignored here (see the disabled assert).
1324 SDValue Chain, CallingConv::ID CallConv,
1325 bool isVarArg,
1327 const SmallVectorImpl<SDValue> &OutVals,
1328 const SDLoc &DL, SelectionDAG &DAG) const {
1329 // FIXME: Fails for r600 tests
1330 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1331 // "wave terminate should not have return values");
1332 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1333}
1334
1335//===---------------------------------------------------------------------===//
1336// Target specific lowering
1337//===---------------------------------------------------------------------===//
1338
1339/// Selects the correct CCAssignFn for a given CallingConvention value.
1344
1349
// Build a TokenFactor that chains `Chain` with every load from a fixed
// (negative-index) stack object whose byte range overlaps the clobbered
// frame index. This orders outgoing stores to the argument area after any
// loads of incoming arguments that live in the same stack bytes.
1351 SelectionDAG &DAG,
1352 MachineFrameInfo &MFI,
1353 int ClobberedFI) const {
1354 SmallVector<SDValue, 8> ArgChains;
1355 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1356 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1357
1358 // Include the original chain at the beginning of the list. When this is
1359 // used by target LowerCall hooks, this helps legalize find the
1360 // CALLSEQ_BEGIN node.
1361 ArgChains.push_back(Chain);
1362
1363 // Add a chain value for each stack argument corresponding
1364 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1365 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1366 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        // Negative frame indices are fixed objects (incoming arguments).
1367 if (FI->getIndex() < 0) {
1368 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1369 int64_t InLastByte = InFirstByte;
1370 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1371
          // Overlap test between [FirstByte, LastByte] and
          // [InFirstByte, InLastByte]; take the load's chain result (value 1).
1372 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1373 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1374 ArgChains.push_back(SDValue(L, 1));
1375 }
1376 }
1377 }
1378 }
1379
1380 // Build a tokenfactor for all the chains.
1381 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1382}
1383
// Emit a DiagnosticInfoUnsupported for a call we cannot lower, then produce
// placeholder (poison) values for the call results and a trivial
// CALLSEQ_START/END pair so the DAG stays well-formed.
1386 StringRef Reason) const {
1387 SDValue Callee = CLI.Callee;
1388 SelectionDAG &DAG = CLI.DAG;
1389
1390 const Function &Fn = DAG.getMachineFunction().getFunction();
1391
1392 StringRef FuncName("<unknown>");
1393
  // Try to recover a callee name for the diagnostic: external symbol first,
  // then global address. (The first dyn_cast's subject line was elided in
  // this view — presumably ExternalSymbolSDNode; confirm in full source.)
1395 FuncName = G->getSymbol();
1396 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1397 FuncName = G->getGlobal()->getName();
1398
1399 DAG.getContext()->diagnose(
1400 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1401
  // Non-tail calls still need result values; poison keeps users legal.
1402 if (!CLI.IsTailCall) {
1403 for (ISD::InputArg &Arg : CLI.Ins)
1404 InVals.push_back(DAG.getPOISON(Arg.VT));
1405 }
1406
1407 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1408 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1409 return CLI.Chain;
1410
1411 SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
1412 return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
1413}
1414
// Calls are unsupported at this level; delegate to the diagnostic path.
1416 SmallVectorImpl<SDValue> &InVals) const {
1417 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1418}
1419
// Dynamic stack allocation is unsupported: diagnose, then return a zero
// pointer merged with the incoming chain so the node's two results are
// still produced.
1421 SelectionDAG &DAG) const {
1422 const Function &Fn = DAG.getMachineFunction().getFunction();
1423
  // (Diagnose-call head elided in this view — emits
  // DiagnosticInfoUnsupported below.)
1425 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1426 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1427 return DAG.getMergeValues(Ops, SDLoc());
1428}
1429
// Central custom-lowering dispatch: route each opcode marked Custom to its
// dedicated Lower* helper. Opcodes that reach the default arm indicate a
// missing implementation and abort.
// NOTE(review): several `case` labels were elided in this extracted view
// (e.g. before lines 1439, 1441, 1470, 1473, 1475) — confirm against the
// full source.
1431 SelectionDAG &DAG) const {
1432 switch (Op.getOpcode()) {
1433 default:
1434 Op->print(errs(), &DAG);
1435 llvm_unreachable("Custom lowering code for this "
1436 "instruction is not implemented yet!");
1437 break;
1439 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1441 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1442 case ISD::SDIVREM:
1443 return LowerSDIVREM(Op, DAG);
1444 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1445 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1446 case ISD::FRINT: return LowerFRINT(Op, DAG);
1447 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1448 case ISD::FROUNDEVEN:
1449 return LowerFROUNDEVEN(Op, DAG);
1450 case ISD::FROUND: return LowerFROUND(Op, DAG);
1451 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1452 case ISD::FLOG2:
1453 return LowerFLOG2(Op, DAG);
1454 case ISD::FLOG:
1455 case ISD::FLOG10:
1456 return LowerFLOGCommon(Op, DAG);
1457 case ISD::FEXP:
1458 case ISD::FEXP10:
1459 return lowerFEXP(Op, DAG);
1460 case ISD::FEXP2:
1461 return lowerFEXP2(Op, DAG);
1462 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1463 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1464 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1465 case ISD::FP_TO_SINT:
1466 case ISD::FP_TO_UINT:
1467 return LowerFP_TO_INT(Op, DAG);
1470 return LowerFP_TO_INT_SAT(Op, DAG);
1471 case ISD::CTTZ:
1473 case ISD::CTLZ:
1475 return LowerCTLZ_CTTZ(Op, DAG);
1477 }
1478 return Op;
1479}
1480
// Replace results of nodes with illegal result types during legalization.
// Each case pushes a lowered value into Results only if lowering succeeded;
// unhandled opcodes fall through to the default and do nothing.
1483 SelectionDAG &DAG) const {
1484 switch (N->getOpcode()) {
  // (case label elided in this view — the comment below indicates the
  // sign_extend_inreg case; confirm in full source.)
1486 // Different parts of legalization seem to interpret which type of
1487 // sign_extend_inreg is the one to check for custom lowering. The extended
1488 // from type is what really matters, but some places check for custom
1489 // lowering of the result type. This results in trying to use
1490 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1491 // nothing here and let the illegal result integer be handled normally.
1492 return;
1493 case ISD::FLOG2:
1494 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1495 Results.push_back(Lowered);
1496 return;
1497 case ISD::FLOG:
1498 case ISD::FLOG10:
1499 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1500 Results.push_back(Lowered);
1501 return;
1502 case ISD::FEXP2:
1503 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1504 Results.push_back(Lowered);
1505 return;
1506 case ISD::FEXP:
1507 case ISD::FEXP10:
1508 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1509 Results.push_back(Lowered);
1510 return;
1511 case ISD::CTLZ:
1513 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1514 Results.push_back(Lowered);
1515 return;
1516 default:
1517 return;
1518 }
1519}
1520
// Lower a global address. For LDS/region globals the "address" is an offset
// within the LDS segment: either a pre-assigned absolute address (including
// named barriers) or one allocated here via allocateLDSGlobal. LDS globals
// reachable from non-kernel functions get a diagnostic plus a trap, since
// such allocation is not supported. Non-LDS globals return SDValue() to use
// default handling.
1522 SDValue Op,
1523 SelectionDAG &DAG) const {
1524
1525 const DataLayout &DL = DAG.getDataLayout();
  // (Decls of the GlobalAddressSDNode `G` and machine-function info `MFI`
  // were elided in this view.)
1527 const GlobalValue *GV = G->getGlobal();
1528
1529 if (!MFI->isModuleEntryFunction()) {
1530 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
    // Absolute-address query head elided in this view; the branch uses a
    // pre-assigned address if present.
1531 if (std::optional<uint32_t> Address =
1533 if (IsNamedBarrier) {
        // Each named barrier occupies 16 bytes; record how many this
        // global covers.
1534 unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
1535 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1536 }
1537 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1538 } else if (IsNamedBarrier) {
1539 llvm_unreachable("named barrier should have an assigned address");
1540 }
1541 }
1542
1543 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1544 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1545 if (!MFI->isModuleEntryFunction() &&
1546 GV->getName() != "llvm.amdgcn.module.lds" &&
1548 SDLoc DL(Op);
1549 const Function &Fn = DAG.getMachineFunction().getFunction();
1551 Fn, "local memory global used by non-kernel function",
1552 DL.getDebugLoc(), DS_Warning));
1553
1554 // We currently don't have a way to correctly allocate LDS objects that
1555 // aren't directly associated with a kernel. We do force inlining of
1556 // functions that use local objects. However, if these dead functions are
1557 // not eliminated, we don't want a compile time error. Just emit a warning
1558 // and a trap, since there should be no callable path here.
1559 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1560 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1561 Trap, DAG.getRoot());
1562 DAG.setRoot(OutputChain);
1563 return DAG.getPOISON(Op.getValueType());
1564 }
1565
1566 // XXX: What does the value of G->getOffset() mean?
1567 assert(G->getOffset() == 0 &&
1568 "Do not know what to do with an non-zero offset");
1569
1570 // TODO: We could emit code to handle the initialization somewhere.
1571 // We ignore the initializer for now and legalize it to allow selection.
1572 // The initializer will anyway get errored out during assembly emission.
1573 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1574 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1575 }
1576 return SDValue();
1577}
1578
// Lower CONCAT_VECTORS. For sub-32-bit element types whose operands are a
// whole number of 32-bit words, bitcast each operand to i32 (or a vector of
// i32), build an i32 vector, and bitcast back — avoiding illegal small-element
// build_vectors. Otherwise fall back to extracting every element and
// rebuilding.
1580 SelectionDAG &DAG) const {
  // (Decl of the Args SmallVector was elided in this view.)
1582 SDLoc SL(Op);
1583
1584 EVT VT = Op.getValueType();
1585 if (VT.getVectorElementType().getSizeInBits() < 32) {
1586 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1587 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      // Repackage each operand as NewNumElt x i32 (scalar i32 when 1).
1588 unsigned NewNumElt = OpBitSize / 32;
1589 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1591 MVT::i32, NewNumElt);
1592 for (const SDUse &U : Op->ops()) {
1593 SDValue In = U.get();
1594 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1595 if (NewNumElt > 1)
1596 DAG.ExtractVectorElements(NewIn, Args);
1597 else
1598 Args.push_back(NewIn);
1599 }
1600
1601 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1602 NewNumElt * Op.getNumOperands());
1603 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1604 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1605 }
1606 }
1607
  // Generic path: scalarize all operands and rebuild the result vector.
1608 for (const SDUse &U : Op->ops())
1609 DAG.ExtractVectorElements(U.get(), Args);
1610
1611 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1612}
1613
// Lower EXTRACT_SUBVECTOR. For 16-bit element types at an even start index,
// operate on pairs of elements as i32s (bitcast source to vNi32, extract
// NumElt/2 words, bitcast back) so the extraction stays in legal 32-bit
// registers. Otherwise extract elements one at a time.
1615 SelectionDAG &DAG) const {
1616 SDLoc SL(Op);
  // (Decl of the Args SmallVector was elided in this view.)
1618 unsigned Start = Op.getConstantOperandVal(1);
1619 EVT VT = Op.getValueType();
1620 EVT SrcVT = Op.getOperand(0).getValueType();
1621
1622 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1623 unsigned NumElt = VT.getVectorNumElements();
1624 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1625 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1626
1627 // Extract 32-bit registers at a time.
1628 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1629 EVT NewVT = NumElt == 2
1630 ? MVT::i32
1631 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1632 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1633
1634 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1635 if (NumElt == 2)
1636 Tmp = Args[0];
1637 else
1638 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1639
1640 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1641 }
1642
  // Generic path: element-by-element extraction. (The element-count
  // argument line was elided in this view.)
1643 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1645
1646 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1647}
1648
// Strip one outer fneg, if present, and return its operand; otherwise
// return the value unchanged.
1649// TODO: Handle fabs too
1651 if (Val.getOpcode() == ISD::FNEG)
1652 return Val.getOperand(0);
1653
1654 return Val;
1655}
1656
// Peek through at most one each of fneg, fabs, and fcopysign (checked in
// that order) to reach the value underneath the sign/magnitude operations.
1658 if (Val.getOpcode() == ISD::FNEG)
1659 Val = Val.getOperand(0);
1660 if (Val.getOpcode() == ISD::FABS)
1661 Val = Val.getOperand(0);
1662 if (Val.getOpcode() == ISD::FCOPYSIGN)
1663 Val = Val.getOperand(0);
1664 return Val;
1665}
1666
// Core of the select(setcc) -> FMIN_LEGACY/FMAX_LEGACY combine. Given the
// compare (LHS cc RHS) selecting True/False, pick the legacy min/max node
// and operand order that reproduces the hardware's NaN behavior (the second
// operand is the one chosen when the compare with NaN fails). Returns
// SDValue() when no profitable transform applies.
1668 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1669 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1670 SelectionDAG &DAG = DCI.DAG;
1671 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1672 switch (CCOpcode) {
  // Equality-style and ordered/unordered-only predicates are not min/max
  // patterns; bail out below.
1673 case ISD::SETOEQ:
1674 case ISD::SETONE:
1675 case ISD::SETUNE:
1676 case ISD::SETNE:
1677 case ISD::SETUEQ:
1678 case ISD::SETEQ:
1679 case ISD::SETFALSE:
1680 case ISD::SETFALSE2:
1681 case ISD::SETTRUE:
1682 case ISD::SETTRUE2:
1683 case ISD::SETUO:
1684 case ISD::SETO:
1685 break;
1686 case ISD::SETULE:
1687 case ISD::SETULT: {
1688 if (LHS == True)
1689 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1690 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1691 }
1692 case ISD::SETOLE:
1693 case ISD::SETOLT:
1694 case ISD::SETLE:
1695 case ISD::SETLT: {
1696 // Ordered. Assume ordered for undefined.
1697
1698 // Only do this after legalization to avoid interfering with other combines
1699 // which might occur.
    // (Legalization-phase predicate line was elided in this view.)
1701 !DCI.isCalledByLegalizer())
1702 return SDValue();
1703
1704 // We need to permute the operands to get the correct NaN behavior. The
1705 // selected operand is the second one based on the failing compare with NaN,
1706 // so permute it based on the compare type the hardware uses.
1707 if (LHS == True)
1708 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1709 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1710 }
1711 case ISD::SETUGE:
1712 case ISD::SETUGT: {
1713 if (LHS == True)
1714 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1715 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1716 }
1717 case ISD::SETGT:
1718 case ISD::SETGE:
1719 case ISD::SETOGE:
1720 case ISD::SETOGT: {
    // (Same elided legalization-phase predicate as the SETOLT case.)
1722 !DCI.isCalledByLegalizer())
1723 return SDValue();
1724
1725 if (LHS == True)
1726 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1727 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1728 }
1729 case ISD::SETCC_INVALID:
1730 llvm_unreachable("Invalid setcc condcode!");
1731 }
1732 return SDValue();
1733}
1734
1735/// Generate Min/Max node
1737 SDValue LHS, SDValue RHS,
1738 SDValue True, SDValue False,
1739 SDValue CC,
1740 DAGCombinerInfo &DCI) const {
  // Direct match: select operands are exactly the compared values (possibly
  // swapped).
1741 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1742 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1743
1744 SelectionDAG &DAG = DCI.DAG;
1745
1746 // If we can't directly match this, try to see if we can fold an fneg to
1747 // match.
1748
  // (Decls of the constant-FP operands — presumably CRHS/CFalse — were
  // elided in this view; confirm in full source.)
1751 SDValue NegTrue = peekFNeg(True);
1752
1753 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1754 // fmin/fmax.
1755 //
1756 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1757 // -> fneg (fmin_legacy lhs, K)
1758 //
1759 // TODO: Use getNegatedExpression
1760 if (LHS == NegTrue && CFalse && CRHS) {
1761 APFloat NegRHS = neg(CRHS->getValueAPF());
1762 if (NegRHS == CFalse->getValueAPF()) {
1763 SDValue Combined =
1764 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1765 if (Combined)
        // Re-apply the folded-away fneg on top of the matched min/max.
1766 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1767 return SDValue();
1768 }
1769 }
1770
1771 return SDValue();
1772}
1773
// Bitcast a 64-bit value to v2i32 and return {low 32 bits, high 32 bits}
// as separate i32 values.
1774std::pair<SDValue, SDValue>
1776 SDLoc SL(Op);
1777
1778 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1779
1780 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1781 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1782
1783 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1784 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1785
1786 return std::pair(Lo, Hi);
1787}
1788
// Return the low 32 bits of a 64-bit value as an i32 (element 0 of the
// v2i32 bitcast).
1790 SDLoc SL(Op);
1791
1792 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1793 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1794 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1795}
1796
// Return the high 32 bits of a 64-bit value as an i32 (element 1 of the
// v2i32 bitcast).
1798 SDLoc SL(Op);
1799
1800 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1801 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1802 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1803}
1804
1805// Split a vector type into two parts. The first part is a power of two vector.
1806// The second part is whatever is left over, and is a scalar if it would
1807// otherwise be a 1-vector.
1808std::pair<EVT, EVT>
1810 EVT LoVT, HiVT;
1811 EVT EltVT = VT.getVectorElementType();
1812 unsigned NumElts = VT.getVectorNumElements();
  // Lo gets ceil(NumElts/2) rounded up to a power of two; Hi gets the rest.
1813 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1814 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1815 HiVT = NumElts - LoNumElts == 1
1816 ? EltVT
1817 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1818 return std::pair(LoVT, HiVT);
1819}
1820
1821// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1822// scalar.
1823std::pair<SDValue, SDValue>
1825 const EVT &LoVT, const EVT &HiVT,
1826 SelectionDAG &DAG) const {
1827 EVT VT = N.getValueType();
  // (Assert head elided in this view; checks requested element counts fit
  // within VT.)
1829 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1830 VT.getVectorNumElements() &&
1831 "More vector elements requested than available!");
  // (Lo extract_subvector head elided in this view — takes LoVT from
  // index 0.)
1833 DAG.getVectorIdxConstant(0, DL));
1834
1835 unsigned LoNumElts = LoVT.getVectorNumElements();
1836
1837 if (HiVT.isVector()) {
1838 unsigned HiNumElts = HiVT.getVectorNumElements();
1839 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1840 // Avoid creating an extract_subvector with an index that isn't a multiple
1841 // of the result type.
1843 DAG.getConstant(LoNumElts, DL, MVT::i32));
1844 return {Lo, Hi};
1845 }
1846
      // Fallback: gather the high elements individually and rebuild.
1848 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1849 /*Count=*/HiNumElts);
1850 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1851 return {Lo, Hi};
1852 }
1853
  // Scalar HiVT: a single extract_vector_elt at LoNumElts. (Head line
  // elided in this view.)
1855 DAG.getVectorIdxConstant(LoNumElts, DL));
1856 return {Lo, Hi};
1857}
1858
// Split a vector load in half: compute the lo/hi destination and memory
// types, emit two extending loads at the appropriate offsets/alignments,
// rejoin the pieces (concat or insert_subvector), and merge the two load
// chains with a TokenFactor.
1860 SelectionDAG &DAG) const {
  // (Decl of `Load` — the LoadSDNode cast of Op — was elided in this view.)
1862 EVT VT = Op.getValueType();
1863 SDLoc SL(Op);
1864
1865
1866 // If this is a 2 element vector, we really want to scalarize and not create
1867 // weird 1 element vectors.
1868 if (VT.getVectorNumElements() == 2) {
1869 SDValue Ops[2];
1870 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1871 return DAG.getMergeValues(Ops, SL);
1872 }
1873
1874 SDValue BasePtr = Load->getBasePtr();
1875 EVT MemVT = Load->getMemoryVT();
1876
1877 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1878
1879 EVT LoVT, HiVT;
1880 EVT LoMemVT, HiMemVT;
1881 SDValue Lo, Hi;
1882
1883 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1884 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1885 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1886
  // The hi half's alignment is what the base alignment guarantees at
  // offset `Size`.
1887 unsigned Size = LoMemVT.getStoreSize();
1888 Align BaseAlign = Load->getAlign();
1889 Align HiAlign = commonAlignment(BaseAlign, Size);
1890
1891 SDValue LoLoad = DAG.getExtLoad(
1892 Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
1893 LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
1894 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1895 SDValue HiLoad = DAG.getExtLoad(
1896 Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
1897 SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
1898 Load->getMemOperand()->getFlags(), Load->getAAInfo());
1899
1900 SDValue Join;
1901 if (LoVT == HiVT) {
1902 // This is the case that the vector is power of two so was evenly split.
1903 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1904 } else {
    // Uneven split: insert the lo subvector, then the hi piece (subvector
    // or scalar element — second insert's head was elided in this view).
1905 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1906 DAG.getVectorIdxConstant(0, SL));
1907 Join = DAG.getNode(
1909 VT, Join, HiLoad,
1911 }
1912
1913 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1914 LoLoad.getValue(1), HiLoad.getValue(1))};
1915
1916 return DAG.getMergeValues(Ops, SL);
1917}
1918
// For a vec3 load that is at least 8-byte aligned or fully dereferenceable
// for 16 bytes, widen it to a vec4 load and extract the vec3 result;
// otherwise (or for non-vec3 loads) split the load instead.
1920 SelectionDAG &DAG) const {
  // (Decl of `Load` — the LoadSDNode cast of Op — was elided in this view.)
1922 EVT VT = Op.getValueType();
1923 SDValue BasePtr = Load->getBasePtr();
1924 EVT MemVT = Load->getMemoryVT();
1925 SDLoc SL(Op);
1926 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1927 Align BaseAlign = Load->getAlign();
1928 unsigned NumElements = MemVT.getVectorNumElements();
1929
1930 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1931 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1932 if (NumElements != 3 ||
1933 (BaseAlign < Align(8) &&
1934 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1935 return SplitVectorLoad(Op, DAG);
1936
1937 assert(NumElements == 3);
1938
  // (The 4-element widened VT/MemVT constructor lines were elided in this
  // view.)
1939 EVT WideVT =
1941 EVT WideMemVT =
1943 SDValue WideLoad = DAG.getExtLoad(
1944 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1945 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  // Return {first-3-elements subvector, chain}.
1946 return DAG.getMergeValues(
1947 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1948 DAG.getVectorIdxConstant(0, SL)),
1949 WideLoad.getValue(1)},
1950 SL);
1951}
1952
// Split a vector store in half: split the stored value and memory type,
// emit two truncating stores at the proper offsets/alignments, and join the
// chains with a TokenFactor. 2-element vectors are scalarized instead.
1954 SelectionDAG &DAG) const {
  // (Decl of `Store` — the StoreSDNode cast of Op — was elided in this
  // view.)
1956 SDValue Val = Store->getValue();
1957 EVT VT = Val.getValueType();
1958
1959 // If this is a 2 element vector, we really want to scalarize and not create
1960 // weird 1 element vectors.
1961 if (VT.getVectorNumElements() == 2)
1962 return scalarizeVectorStore(Store, DAG);
1963
1964 EVT MemVT = Store->getMemoryVT();
1965 SDValue Chain = Store->getChain();
1966 SDValue BasePtr = Store->getBasePtr();
1967 SDLoc SL(Op);
1968
1969 EVT LoVT, HiVT;
1970 EVT LoMemVT, HiMemVT;
1971 SDValue Lo, Hi;
1972
1973 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1974 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1975 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1976
1977 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1978
1979 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1980 Align BaseAlign = Store->getAlign();
1981 unsigned Size = LoMemVT.getStoreSize();
  // Alignment guaranteed at byte offset `Size` given the base alignment.
1982 Align HiAlign = commonAlignment(BaseAlign, Size);
1983
1984 SDValue LoStore =
1985 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1986 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1987 SDValue HiStore = DAG.getTruncStore(
1988 Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
1989 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1990
1991 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1992}
1993
1994// This is a shortcut for integer division because we have fast i32<->f32
1995// conversions, and fast f32 reciprocal instructions. The fractional part of a
1996// float is enough to accurately represent up to a 24-bit signed integer.
1998 bool Sign) const {
1999 SDLoc DL(Op);
2000 EVT VT = Op.getValueType();
2001 SDValue LHS = Op.getOperand(0);
2002 SDValue RHS = Op.getOperand(1);
2003 MVT IntVT = MVT::i32;
2004 MVT FltVT = MVT::f32;
2005
  // Bail out unless both operands fit in 24 bits (>= 9 sign bits of a
  // 32-bit value leaves at most 24 significant bits). Returns SDValue() to
  // use the generic expansion.
2006 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
2007 if (LHSSignBits < 9)
2008 return SDValue();
2009
2010 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2011 if (RHSSignBits < 9)
2012 return SDValue();
2013
2014 unsigned BitSize = VT.getSizeInBits();
2015 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2016 unsigned DivBits = BitSize - SignBits;
2017 if (Sign)
2018 ++DivBits;
2019
  // (Decls of the int<->float conversion opcodes — presumably
  // ToFp/ToInt chosen by `Sign` — were elided in this view.)
2023 SDValue jq = DAG.getConstant(1, DL, IntVT);
2024
2025 if (Sign) {
2026 // char|short jq = ia ^ ib;
2027 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2028
2029 // jq = jq >> (bitsize - 2)
2030 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2031 DAG.getConstant(BitSize - 2, DL, VT));
2032
2033 // jq = jq | 0x1
    // jq is now +1 or -1 according to the sign of the quotient.
2034 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2035 }
2036
2037 // int ia = (int)LHS;
2038 SDValue ia = LHS;
2039
2040 // int ib, (int)RHS;
2041 SDValue ib = RHS;
2042
2043 // float fa = (float)ia;
2044 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2045
2046 // float fb = (float)ib;
2047 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2048
  // Approximate quotient: fa * rcp(fb), truncated toward zero.
2049 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2050 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2051
2052 // fq = trunc(fq);
2053 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2054
2055 // float fqneg = -fq;
2056 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2057
  // (Subtarget feature lookup line elided in this view.)
2060 bool UseFmadFtz = false;
2061 if (Subtarget->isGCN()) {
    // (Denormal-mode condition lines elided in this view.)
2063 UseFmadFtz =
2065 }
2066
2067 // float fr = mad(fqneg, fb, fa);
  // Pick FMA when no MAD/MAC, FMAD_FTZ when denormals must be flushed,
  // otherwise plain FMAD (third alternative's line elided in this view).
2068 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2069 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2071 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2072
2073 // int iq = (int)fq;
2074 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2075
2076 // fr = fabs(fr);
2077 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2078
2079 // fb = fabs(fb);
2080 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2081
2082 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2083
2084 // int cv = fr >= fb;
2085 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2086
2087 // jq = (cv ? jq : 0);
  // Correction step: bump the quotient by +/-1 when the remainder
  // overshoots the divisor.
2088 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2089
2090 // dst = iq + jq;
2091 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2092
2093 // Rem needs compensation, it's easier to recompute it
2094 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2095 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2096
2097 // Truncate to number of bits this divide really is.
2098 if (Sign) {
2099 SDValue InRegSize
2100 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2101 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2102 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2103 } else {
2104 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2105 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2106 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2107 }
2108
2109 return DAG.getMergeValues({ Div, Rem }, DL);
2110}
2111
// Expand a 64-bit unsigned divide+remainder into 32-bit operations,
// appending {quotient, remainder} (both i64) to Results.
// Three strategies are tried in order:
//   1. Both operands known to fit in 32 bits -> one 32-bit UDIVREM.
//   2. i64 is legal -> f32 reciprocal estimate refined by two rounds of
//      unsigned Newton-Raphson, then explicit quotient/remainder correction.
//   3. Otherwise (r600 path) -> speculative high-half divide followed by a
//      bit-at-a-time restoring long division over the low half.
2113 SelectionDAG &DAG,
2115 SDLoc DL(Op);
2116 EVT VT = Op.getValueType();
2117
2118 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2119
2120 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2121
2122 SDValue One = DAG.getConstant(1, DL, HalfVT);
2123 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2124
2125 //HiLo split
2126 SDValue LHS_Lo, LHS_Hi;
2127 SDValue LHS = Op.getOperand(0);
2128 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2129
2130 SDValue RHS_Lo, RHS_Hi;
2131 SDValue RHS = Op.getOperand(1);
2132 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2133
// Strategy 1: if the high 32 bits of both operands are provably zero, a
// single 32-bit divide suffices; rebuild i64 results with a zero high half.
2134 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2135 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2136
2137 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2138 LHS_Lo, RHS_Lo);
2139
2140 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2141 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2142
2143 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2144 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2145 return;
2146 }
2147
2148 if (isTypeLegal(MVT::i64)) {
2149 // The algorithm here is based on ideas from "Software Integer Division",
2150 // Tom Rodeheffer, August 2008.
2151
2154
2155 // Compute denominator reciprocal.
// Use real FMA when the target lacks v_mad/v_mac f32; otherwise the
// flush-to-zero MAD node.
2156 unsigned FMAD =
2157 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2160 : (unsigned)AMDGPUISD::FMAD_FTZ
2161
// Build a 64-bit fixed-point estimate of 1/RHS from a f32 reciprocal.
// Bit patterns as f32: 0x4f800000 = 2^32, 0x2f800000 = 2^-32,
// 0xcf800000 = -2^32, 0x5f7ffffc ~= 2^64 (just under) used to scale the
// reciprocal into the fixed-point domain.
2162 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2163 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2164 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2165 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2166 Cvt_Lo);
2167 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2168 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2169 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2170 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2171 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2172 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2173 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2174 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2175 Mul1);
2176 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2177 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2178 SDValue Rcp64 = DAG.getBitcast(VT,
2179 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2180
2181 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2182 SDValue One64 = DAG.getConstant(1, DL, VT);
2183 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2184 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2185
2186 // First round of UNR (Unsigned integer Newton-Raphson).
// Rcp64 += mulhi(Rcp64, Rcp64 * -RHS), done as 32-bit adds with carry.
2187 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2188 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2189 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2190 SDValue Mulhi1_Lo, Mulhi1_Hi;
2191 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2192 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2193 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2194 Mulhi1_Lo, Zero1);
2195 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2196 Mulhi1_Hi, Add1_Lo.getValue(1));
2197 SDValue Add1 = DAG.getBitcast(VT,
2198 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2199
2200 // Second round of UNR.
2201 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2202 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2203 SDValue Mulhi2_Lo, Mulhi2_Hi;
2204 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2205 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2206 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2207 Mulhi2_Lo, Zero1);
2208 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2209 Mulhi2_Hi, Add2_Lo.getValue(1));
2210 SDValue Add2 = DAG.getBitcast(VT,
2211 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2212
// Initial quotient estimate: mulhi(LHS, ~1/RHS). May be up to 2 too small;
// the selects below correct it and the remainder.
2213 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2214
2215 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2216
// Remainder candidate Sub1 = LHS - RHS * quotient-estimate (64-bit subtract
// built from 32-bit halves with borrow).
2217 SDValue Mul3_Lo, Mul3_Hi;
2218 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2219 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2220 Mul3_Lo, Zero1);
2221 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2222 Mul3_Hi, Sub1_Lo.getValue(1));
2223 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2224 SDValue Sub1 = DAG.getBitcast(VT,
2225 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2226
// C3 = (Sub1 >= RHS) composed from per-half unsigned compares:
// high halves decide unless equal, then the low halves decide.
2227 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2228 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2229 ISD::SETUGE);
2230 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2231 ISD::SETUGE);
2232 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2233
2234 // TODO: Here and below portions of the code can be enclosed into if/endif.
2235 // Currently control flow is unconditional and we have 4 selects after
2236 // potential endif to substitute PHIs.
2237
2238 // if C3 != 0 ...
// First correction step: Sub2 = Sub1 - RHS, Add3 = quotient + 1.
2239 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2240 RHS_Lo, Zero1);
2241 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2242 RHS_Hi, Sub1_Lo.getValue(1));
2243 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2244 Zero, Sub2_Lo.getValue(1));
2245 SDValue Sub2 = DAG.getBitcast(VT,
2246 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2247
2248 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2249
// C6 = (Sub2 >= RHS), same two-level compare as C3.
2250 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2251 ISD::SETUGE);
2252 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2253 ISD::SETUGE);
2254 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2255
2256 // if (C6 != 0)
// Second correction step: Sub3 = Sub2 - RHS, Add4 = quotient + 2.
2257 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2258
2259 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2260 RHS_Lo, Zero1);
2261 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2262 RHS_Hi, Sub2_Lo.getValue(1));
2263 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2264 Zero, Sub3_Lo.getValue(1));
2265 SDValue Sub3 = DAG.getBitcast(VT,
2266 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2267
2268 // endif C6
2269 // endif C3
2270
// Fold the two "did we overshoot" flags into the final quotient/remainder.
2271 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2272 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2273
2274 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2275 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2276
2277 Results.push_back(Div);
2278 Results.push_back(Rem);
2279
2280 return;
2281 }
2282
2283 // r600 expandion.
2284 // Get Speculative values
// If RHS_Hi == 0 the high quotient/remainder come from a 32-bit divide of
// LHS_Hi; otherwise the high quotient is 0 and the remainder starts as
// LHS_Hi.
2285 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2286 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2287
2288 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2289 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2290 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2291
2292 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2293 SDValue DIV_Lo = Zero;
2294
2295 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2296
// Restoring long division: bring one bit of LHS_Lo (MSB first) into the
// running remainder each iteration; emit a quotient bit when REM >= RHS.
2297 for (unsigned i = 0; i < halfBitWidth; ++i) {
2298 const unsigned bitPos = halfBitWidth - i - 1;
2299 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2300 // Get value of high bit
2301 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2302 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2303 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2304
2305 // Shift
2306 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2307 // Add LHS high bit
2308 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2309
2310 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2311 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2312
2313 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2314
2315 // Update REM
2316 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2317 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2318 }
2319
2320 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2321 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2322 Results.push_back(DIV);
2323 Results.push_back(REM);
2324}
2325
// Lower ISD::UDIVREM. i64 defers to LowerUDIVREM64; i32 first tries the
// 24-bit float-based fast path (LowerDIVREM24); the general path uses one
// round of unsigned Newton-Raphson on a hardware reciprocal estimate plus
// two conditional quotient/remainder refinements. Returns merged {Q, R}.
2327 SelectionDAG &DAG) const {
2328 SDLoc DL(Op);
2329 EVT VT = Op.getValueType();
2330
2331 if (VT == MVT::i64) {
2333 LowerUDIVREM64(Op, DAG, Results);
2334 return DAG.getMergeValues(Results, DL);
2335 }
2336
2337 if (VT == MVT::i32) {
// LowerDIVREM24 returns a null SDValue when the operands don't fit in 24
// bits; fall through to the generic expansion in that case.
2338 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2339 return Res;
2340 }
2341
2342 SDValue X = Op.getOperand(0);
2343 SDValue Y = Op.getOperand(1);
2344
2345 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2346 // algorithm used here.
2347
2348 // Initial estimate of inv(y).
2349 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2350
2351 // One round of UNR.
2352 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2353 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2354 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2355 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2356
2357 // Quotient/remainder estimate.
2358 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2359 SDValue R =
2360 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2361
// The estimate may be short by up to 2; each refinement adds 1 to Q and
// subtracts Y from R while R >= Y.
2362 // First quotient/remainder refinement.
2363 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2364 SDValue One = DAG.getConstant(1, DL, VT);
2365 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2366 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2367 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2368 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2369 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2370
2371 // Second quotient/remainder refinement.
2372 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2373 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2374 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2375 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2376 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2377
2378 return DAG.getMergeValues({Q, R}, DL);
2379}
2380
// Lower ISD::SDIVREM by reducing to the unsigned expansion.
// Fast paths: i32 operands that fit in 24 bits (LowerDIVREM24), and i64
// operands with > 32 sign bits (a 32-bit SDIVREM of the low halves,
// sign-extended back). The general path takes absolute values with the
// branch-free (x + s) ^ s trick, does UDIVREM, then re-applies the signs.
2382 SelectionDAG &DAG) const {
2383 SDLoc DL(Op);
2384 EVT VT = Op.getValueType();
2385
2386 SDValue LHS = Op.getOperand(0);
2387 SDValue RHS = Op.getOperand(1);
2388
2389 SDValue Zero = DAG.getConstant(0, DL, VT);
2390 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2391
2392 if (VT == MVT::i32) {
2393 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2394 return Res;
2395 }
2396
// > 32 sign bits means the values are faithfully represented by their low
// 32 bits, so a 32-bit signed divide is exact.
2397 if (VT == MVT::i64 &&
2398 DAG.ComputeNumSignBits(LHS) > 32 &&
2399 DAG.ComputeNumSignBits(RHS) > 32) {
2400 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2401
2402 //HiLo split
2403 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2404 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2405 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2406 LHS_Lo, RHS_Lo);
2407 SDValue Res[2] = {
2408 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2409 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2410 };
2411 return DAG.getMergeValues(Res, DL);
2412 }
2413
// LHSign/RHSign are all-ones when the operand is negative, else zero.
// Quotient sign is the XOR of the operand signs.
2414 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2415 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2416 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2417 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2418
// |x| = (x + sign) ^ sign: two's-complement negate when sign is all-ones,
// identity when it is zero.
2419 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2420 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2421
2422 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2423 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2424
2425 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2426 SDValue Rem = Div.getValue(1);
2427
// Undo the conditional negation: (u ^ sign) - sign.
2428 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2429 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2430
2431 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2432 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2433
2434 SDValue Res[2] = {
2435 Div,
2436 Rem
2437 };
2438 return DAG.getMergeValues(Res, DL);
2439}
2440
// Lower f64 FCEIL: truncate toward zero, then add 1.0 when the input was
// positive and not already integral. Ordered compares keep NaN on the
// trunc-only path.
2442 SDLoc SL(Op);
2443 SDValue Src = Op.getOperand(0);
2444
2445 // result = trunc(src)
2446 // if (src > 0.0 && src != result)
2447 // result += 1.0
2448
2449 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2450
2451 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2452 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2453
2454 EVT SetCCVT =
2455 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2456
// NOTE: despite the name, Lt0 is a src > 0.0 test (SETOGT).
2457 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2458 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2459 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2460
2461 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2462 // TODO: Should this propagate fast-math-flags?
2463 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2464}
2465
// Extract the unbiased exponent of an f64 whose high 32 bits are in Hi:
// bitfield-extract the 11 exponent bits starting at bit 20 (52 - 32), then
// subtract the IEEE-754 double bias of 1023. Result is a signed i32.
2467 SelectionDAG &DAG) {
2468 const unsigned FractBits = 52;
2469 const unsigned ExpBits = 11;
2470
2471 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2472 Hi,
2473 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2474 DAG.getConstant(ExpBits, SL, MVT::i32));
2475 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2476 DAG.getConstant(1023, SL, MVT::i32));
2477
2478 return Exp;
2479}
2480
// Lower f64 FTRUNC by integer bit manipulation: clear all fraction bits
// whose weight is below 1.0 for the value's exponent.
//   exp < 0   -> |src| < 1, result is just the sign bit (+/-0.0).
//   exp > 51  -> no fractional bits remain, src is already integral
//                (also covers inf/NaN, whose biased exponent is maximal).
//   otherwise -> mask off the low (52 - exp) fraction bits.
2482 SDLoc SL(Op);
2483 SDValue Src = Op.getOperand(0);
2484
2485 assert(Op.getValueType() == MVT::f64);
2486
2487 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2488
2489 // Extract the upper half, since this is where we will find the sign and
2490 // exponent.
2491 SDValue Hi = getHiHalf64(Src, DAG);
2492
2493 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2494
2495 const unsigned FractBits = 52;
2496
2497 // Extract the sign bit.
2498 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2499 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2500
2501 // Extend back to 64-bits.
2502 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2503 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2504
2505 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2506 const SDValue FractMask
2507 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2508
// Arithmetic shift of the 52-bit fraction mask by exp leaves set bits only
// at positions that must be cleared; invert to get the keep-mask.
2509 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2510 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2511 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2512
2513 EVT SetCCVT =
2514 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2515
2516 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2517
2518 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2519 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2520
2521 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2522 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2523
2524 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2525}
2526
2528 SelectionDAG &DAG) const {
2529 SDLoc SL(Op);
2530 SDValue Src = Op.getOperand(0);
2531
2532 assert(Op.getValueType() == MVT::f64);
2533
2534 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2535 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2536 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2537
2538 // TODO: Should this propagate fast-math-flags?
2539
2540 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2541 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2542
2543 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2544
2545 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2546 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2547
2548 EVT SetCCVT =
2549 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2550 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2551
2552 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2553}
2554
// Lower by rewriting to FROUNDEVEN; see the comment below for why the
// distinction from rint/nearbyint does not matter for this target.
2556 SelectionDAG &DAG) const {
2557 // FNEARBYINT and FRINT are the same, except in their handling of FP
2558 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2559 // rint, so just treat them as equivalent.
2560 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2561 Op.getOperand(0));
2562}
2563
// Lower by forwarding the operand to FROUNDEVEN (round half to even),
// which is equivalent here since FP exceptions are not modeled.
2565 auto VT = Op.getValueType();
2566 auto Arg = Op.getOperand(0u);
2567 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2568}
2569
2570// XXX - May require not supporting f32 denormals?
2571
2572// Don't handle v2f16. The extra instructions to scalarize and repack around the
2573// compare and vselect end up producing worse code than scalarizing the whole
2574// operation.
// Lower FROUND (round half away from zero):
//   T = trunc(X); result = T + copysign(|X - T| >= 0.5 ? 1.0 : 0.0, X).
2576 SDLoc SL(Op);
2577 SDValue X = Op.getOperand(0);
2578 EVT VT = Op.getValueType();
2579
2580 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2581
2582 // TODO: Should this propagate fast-math-flags?
2583
2584 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2585
2586 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2587
2588 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2589 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2590
2591 EVT SetCCVT =
2592 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2593
// Fractional part can never exceed 0.5 exactly at the tie, and the ordered
// compare is false for NaN, so NaN gets a 0.0 offset (result stays NaN
// through the FADD anyway).
2594 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2595 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2596 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2597
2598 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2599 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2600}
2601
// Lower f64 FFLOOR: mirror image of LowerFCEIL — truncate toward zero and
// subtract 1.0 when the input was negative and not already integral.
2603 SDLoc SL(Op);
2604 SDValue Src = Op.getOperand(0);
2605
2606 // result = trunc(src);
2607 // if (src < 0.0 && src != result)
2608 // result += -1.0.
2609
2610 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2611
2612 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2613 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2614
2615 EVT SetCCVT =
2616 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2617
// Ordered compares are false for NaN, so NaN inputs take the trunc result.
2618 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2619 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2620 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2621
2622 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2623 // TODO: Should this propagate fast-math-flags?
2624 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2625}
2626
2627/// Return true if it's known that \p Src can never be an f32 denormal value.
// Conservative whitelist of producers whose f32 result cannot be denormal.
// Any opcode not listed returns false.
2629 switch (Src.getOpcode()) {
// f16 -> f32 extension: the smallest f16 subnormal (2^-24) is a normal
// f32, so no extended f16 value lands in the f32 denormal range.
2630 case ISD::FP_EXTEND:
2631 return Src.getOperand(0).getValueType() == MVT::f16;
2632 case ISD::FP16_TO_FP:
2633 case ISD::FFREXP:
2634 case ISD::FSQRT:
2635 case AMDGPUISD::LOG:
2636 case AMDGPUISD::EXP:
2637 return true;
// Intrinsic case: same reasoning for the hardware transcendental
// intrinsics, whose outputs stay out of the denormal range.
2639 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2640 switch (IntrinsicID) {
2641 case Intrinsic::amdgcn_frexp_mant:
2642 case Intrinsic::amdgcn_log:
2643 case Intrinsic::amdgcn_log_clamp:
2644 case Intrinsic::amdgcn_exp2:
2645 case Intrinsic::amdgcn_sqrt:
2646 return true;
2647 default:
2648 return false;
2649 }
2650 }
2651 default:
2652 return false;
2653 }
2654
2655 llvm_unreachable("covered opcode switch");
2656}
2657
// Whether approximate-function lowering is allowed for this node, i.e. the
// 'afn' fast-math flag is present on the node's flags.
2659 SDNodeFlags Flags) {
2660 return Flags.hasApproximateFuncs();
2661}
2662
2671
// Build the predicate "Src < smallest positive normalized value" (ordered
// compare, so false for NaN). True for positive denormals, but also for
// zero and all negative values — see the comment below for why that is
// acceptable to callers that use this to pick the scaled path.
2673 SDValue Src,
2674 SDNodeFlags Flags) const {
2675 SDLoc SL(Src);
2676 EVT VT = Src.getValueType();
2677 const fltSemantics &Semantics = VT.getFltSemantics();
2678 SDValue SmallestNormal =
2679 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2680
2681 // Want to scale denormals up, but negatives and 0 work just as well on the
2682 // scaled path.
2683 SDValue IsLtSmallestNormal = DAG.getSetCC(
2684 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2685 SmallestNormal, ISD::SETOLT);
2686
2687 return IsLtSmallestNormal;
2688}
2689
// Build the predicate "|Src| < +inf". The ordered SETOLT is also false for
// NaN, so this simultaneously excludes infinities and NaN.
2691 SDNodeFlags Flags) const {
2692 SDLoc SL(Src);
2693 EVT VT = Src.getValueType();
2694 const fltSemantics &Semantics = VT.getFltSemantics();
2695 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2696
2697 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2698 SDValue IsFinite = DAG.getSetCC(
2699 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2700 Inf, ISD::SETOLT);
2701 return IsFinite;
2702}
2703
2704/// If denormal handling is required return the scaled input to FLOG2, and the
2705/// check for denormal range. Otherwise, return null values.
// Returns {Src * 2^32 for sub-normal-range inputs (else Src * 1.0),
// the "input was below the smallest normal" predicate}, or a pair of null
// SDValues when no denormal handling is required. The caller is expected
// to subtract 32 * log(2)/log(base) from the log of the scaled input when
// the predicate is true.
2706 std::pair<SDValue, SDValue>
2708 SDValue Src, SDNodeFlags Flags) const {
2709 if (!needsDenormHandlingF32(DAG, Src, Flags))
2710 return {};
2711
2712 MVT VT = MVT::f32;
2713 const fltSemantics &Semantics = APFloat::IEEEsingle();
2714 SDValue SmallestNormal =
2715 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2716
2717 SDValue IsLtSmallestNormal = DAG.getSetCC(
2718 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2719 SmallestNormal, ISD::SETOLT);
2720
// Multiplying a denormal by 2^32 yields an exactly-representable normal,
// so the subsequent log is computed on a normal value.
2721 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2722 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2723 SDValue ScaleFactor =
2724 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2725
2726 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2727 return {ScaledInput, IsLtSmallestNormal};
2728}
2729
2731 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2732 // If we have to handle denormals, scale up the input and adjust the result.
2733
2734 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2735 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2736
2737 SDLoc SL(Op);
2738 EVT VT = Op.getValueType();
2739 SDValue Src = Op.getOperand(0);
2740 SDNodeFlags Flags = Op->getFlags();
2741
// f16 path: promote to f32, take the hardware log, round back. No scaling
// needed since promoted f16 values are never f32 denormals.
2742 if (VT == MVT::f16) {
2743 // Nothing in half is a denormal when promoted to f32.
2744 assert(!isTypeLegal(VT));
2745 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2746 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2747 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2748 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2749 }
2750
// Null ScaledInput means no denormal handling is required; emit the raw
// hardware log.
2751 auto [ScaledInput, IsLtSmallestNormal] =
2752 getScaledLogInput(DAG, SL, Src, Flags);
2753 if (!ScaledInput)
2754 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2755
2756 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2757
// log2(x * 2^32) = log2(x) + 32, so subtract 32 on the scaled path.
2758 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2759 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2760 SDValue ResultOffset =
2761 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2762 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2763}
2764
2765static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2766 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2767 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2768 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2769}
2770
// Shared lowering for FLOG (natural log) and FLOG10.
// Fast path (f16, or 'afn' set): delegate to the unsafe single-constant
// expansion. Precise path: compute y = log2(x) with the hardware log, then
// multiply by ln(2) or ln(2)/ln(10) carried as a double-f32 (hi+lo)
// constant pair to recover extra precision, with a denormal-scaling fixup
// at the end.
2772 SelectionDAG &DAG) const {
2773 SDValue X = Op.getOperand(0);
2774 EVT VT = Op.getValueType();
2775 SDNodeFlags Flags = Op->getFlags();
2776 SDLoc DL(Op);
2777 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2778 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2779
2780 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2781 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
2782 // depending on !fpmath metadata.
2783
2784 bool PromoteToF32 = VT == MVT::f16 && (!Flags.hasApproximateFuncs() ||
2785 !isTypeLegal(MVT::f16));
2786
2787 if (PromoteToF32) {
2788 // Log and multiply in f32 is always good enough for f16.
2789 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2790 }
2791
2792 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2793 if (PromoteToF32) {
2794 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2795 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2796 }
2797
2798 return Lowered;
2799 }
2800
// Precise path: scale denormal-range inputs up by 2^32 first (IsScaled
// records which lanes were scaled so the result can be corrected below).
2801 SDValue ScaledInput, IsScaled;
2802 if (VT == MVT::f16)
2803 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2804 else {
2805 std::tie(ScaledInput, IsScaled) = getScaledLogInput(DAG, DL, X, Flags);
2806 if (ScaledInput)
2807 X = ScaledInput;
2808 }
2809
2810 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2811
2812 SDValue R;
2813 if (Subtarget->hasFastFMAF32()) {
// FMA variant: r = y*c; r += fma(y, cc, fma(y, c, -r)) recovers the
// rounding error of the head product.
2814 // c+cc are ln(2)/ln(10) to more than 49 bits
2815 const float c_log10 = 0x1.344134p-2f;
2816 const float cc_log10 = 0x1.09f79ep-26f;
2817
2818 // c + cc is ln(2) to more than 49 bits
2819 const float c_log = 0x1.62e42ep-1f;
2820 const float cc_log = 0x1.efa39ep-25f;
2821
2822 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2823 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2824 // This adds correction terms for which contraction may lead to an increase
2825 // in the error of the approximation, so disable it.
2826 Flags.setAllowContract(false);
2827 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2828 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2829 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2830 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2831 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2832 } else {
// No-FMA variant: split y into a high part with 12 trailing mantissa bits
// zeroed (exact products) and a low remainder, then accumulate partial
// products from smallest to largest with unfused mul+add.
2833 // ch+ct is ln(2)/ln(10) to more than 36 bits
2834 const float ch_log10 = 0x1.344000p-2f;
2835 const float ct_log10 = 0x1.3509f6p-18f;
2836
2837 // ch + ct is ln(2) to more than 36 bits
2838 const float ch_log = 0x1.62e000p-1f;
2839 const float ct_log = 0x1.0bfbe8p-15f;
2840
2841 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2842 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2843
2844 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2845 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2846 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2847 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2848 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2849 // This adds correction terms for which contraction may lead to an increase
2850 // in the error of the approximation, so disable it.
2851 Flags.setAllowContract(false);
2852 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2853 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2854 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2855 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2856 }
2857
2858 const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();
2859
// For inf/NaN inputs the hardware log result Y must be passed through
// untouched rather than multiplied by the constant pair.
2860 // TODO: Check if known finite from source value.
2861 if (!IsFiniteOnly) {
2862 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2863 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2864 }
2865
// Undo the 2^32 input scaling: subtract 32*log10(2) or 32*ln(2).
2866 if (IsScaled) {
2867 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2868 SDValue ShiftK =
2869 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2870 SDValue Shift =
2871 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2872 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2873 }
2874
2875 return R;
2876}
2877
2881
2882// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2883// promote f16 operation.
// Fast-math expansion of flog/flog10 as log2(x) * (1/log2(base)):
// f32 uses the hardware AMDGPUISD::LOG (with denormal-scaling fixup folded
// into the result offset); other types fall back to a generic ISD::FLOG2.
// Accurate to slightly better than 1 ulp; also used for promoted f16.
2885 SelectionDAG &DAG, bool IsLog10,
2886 SDNodeFlags Flags) const {
2887 EVT VT = Src.getValueType();
2888 unsigned LogOp =
2889 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2890
// NOTE(review): the initializer of Log2BaseInverted was lost in this
// extraction — presumably ln(2)/ln(10) for log10 and ln(2) for log;
// confirm against upstream before relying on the exact expression.
2891 double Log2BaseInverted =
2893
2894 if (VT == MVT::f32) {
2895 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2896 if (ScaledInput) {
2897 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
// log2 of the 2^32-scaled input is 32 too large; pre-scale the -32
// correction by the base-conversion factor so one FMA applies both.
2898 SDValue ScaledResultOffset =
2899 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2900
2901 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2902
2903 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2904 ScaledResultOffset, Zero, Flags);
2905
2906 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2907
2908 if (Subtarget->hasFastFMAF32())
2909 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2910 Flags);
2911 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2912 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2913 }
2914 }
2915
// No denormal handling required: plain log2 followed by one multiply.
2916 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2917 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2918
2919 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2920 Flags);
2921}
2922
2923// This expansion gives a result slightly better than 1ulp.
// Expand f64 FEXP / FEXP2 / FEXP10.
// Classic range-reduction scheme: pick integer dn and residual f so that
// the input equals (dn + f) in the base-2 exponent domain, evaluate
// exp(t) - 1 on the reduced argument t with a degree-11 polynomial (Horner
// via chained FMAs), then reassemble with ldexp(p, dn). The reduction
// constants are split into head+tail pairs (C2/C3, C4/C5) to keep the
// reduction exact in double precision.
2925 SelectionDAG &DAG) const {
2926 SDLoc DL(Op);
2927 SDValue X = Op.getOperand(0);
2928
2929 // TODO: Check if reassoc is safe. There is an output change in exp2 and
2930 // exp10, which slightly increases ulp.
2931 SDNodeFlags Flags = Op->getFlags() & ~SDNodeFlags::AllowReassociation;
2932
2933 SDValue DN, F, T;
2934
2935 if (Op.getOpcode() == ISD::FEXP2) {
2936 // dn = rint(x)
2937 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, X, Flags);
2938 // f = x - dn
2939 F = DAG.getNode(ISD::FSUB, DL, MVT::f64, X, DN, Flags);
2940 // t = f*C1 + f*C2
// C1 + C2 is ln(2) split into head and tail: t = f * ln(2).
2941 SDValue C1 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
2942 SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
2943 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C2, Flags);
2944 T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C1, Mul2, Flags);
2945 } else if (Op.getOpcode() == ISD::FEXP10) {
2946 // dn = rint(x * C1)
// C1 = log2(10); dn approximates the power-of-two exponent of 10^x.
2947 SDValue C1 = DAG.getConstantFP(0x1.a934f0979a371p+1, DL, MVT::f64);
2948 SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
2949 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);
2950
2951 // f = FMA(-dn, C2, FMA(-dn, C3, x))
// C3 + C2 is log10(2) split head/tail: f = x - dn*log10(2), exactly.
2952 SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
2953 SDValue C2 = DAG.getConstantFP(-0x1.9dc1da994fd21p-59, DL, MVT::f64);
2954 SDValue C3 = DAG.getConstantFP(0x1.34413509f79ffp-2, DL, MVT::f64);
2955 SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
2956 F = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
2957
2958 // t = FMA(f, C4, f*C5)
// C4 + C5 is ln(10) split head/tail: t = f * ln(10).
2959 SDValue C4 = DAG.getConstantFP(0x1.26bb1bbb55516p+1, DL, MVT::f64);
2960 SDValue C5 = DAG.getConstantFP(-0x1.f48ad494ea3e9p-53, DL, MVT::f64);
2961 SDValue MulF = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C5, Flags);
2962 T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C4, MulF, Flags);
2963 } else { // ISD::FEXP
2964 // dn = rint(x * C1)
// C1 = log2(e); C3 + C2 is ln(2) split head/tail: t = x - dn*ln(2).
2965 SDValue C1 = DAG.getConstantFP(0x1.71547652b82fep+0, DL, MVT::f64);
2966 SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
2967 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);
2968
2969 // t = FMA(-dn, C2, FMA(-dn, C3, x))
2970 SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
2971 SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
2972 SDValue C3 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
2973 SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
2974 T = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
2975 }
2976
2977 // Polynomial expansion for p
// Horner evaluation of a degree-11 minimax polynomial for exp(t); the
// coefficients approximate 1/11!, 1/10!, ..., 1/2!, and the two trailing
// FMAs with One fold in the 1/1! and constant terms.
2978 SDValue P = DAG.getConstantFP(0x1.ade156a5dcb37p-26, DL, MVT::f64);
2979 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2980 DAG.getConstantFP(0x1.28af3fca7ab0cp-22, DL, MVT::f64),
2981 Flags);
2982 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2983 DAG.getConstantFP(0x1.71dee623fde64p-19, DL, MVT::f64),
2984 Flags);
2985 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2986 DAG.getConstantFP(0x1.a01997c89e6b0p-16, DL, MVT::f64),
2987 Flags);
2988 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2989 DAG.getConstantFP(0x1.a01a014761f6ep-13, DL, MVT::f64),
2990 Flags);
2991 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2992 DAG.getConstantFP(0x1.6c16c1852b7b0p-10, DL, MVT::f64),
2993 Flags);
2994 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2995 DAG.getConstantFP(0x1.1111111122322p-7, DL, MVT::f64), Flags);
2996 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2997 DAG.getConstantFP(0x1.55555555502a1p-5, DL, MVT::f64), Flags);
2998 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2999 DAG.getConstantFP(0x1.5555555555511p-3, DL, MVT::f64), Flags);
3000 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3001 DAG.getConstantFP(0x1.000000000000bp-1, DL, MVT::f64), Flags);
3002
3003 SDValue One = DAG.getConstantFP(1.0, DL, MVT::f64);
3004
3005 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
3006 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
3007
3008 // z = ldexp(p, (int)dn)
3009 SDValue DNInt = DAG.getNode(ISD::FP_TO_SINT, DL, MVT::i32, DN);
3010 SDValue Z = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, P, DNInt, Flags);
3011
// Overflow clamp: force +inf when x > 1024 in the base-2 exponent domain.
// The unordered ULE is true for NaN, keeping NaN on the Z path.
3012 // Overflow/underflow guards
3013 SDValue CondHi = DAG.getSetCC(
3014 DL, MVT::i1, X, DAG.getConstantFP(1024.0, DL, MVT::f64), ISD::SETULE);
3015
3016 if (!Flags.hasNoInfs()) {
3017 SDValue PInf = DAG.getConstantFP(std::numeric_limits<double>::infinity(),
3018 DL, MVT::f64);
3019 Z = DAG.getSelect(DL, MVT::f64, CondHi, Z, PInf, Flags);
3020 }
3021
// Underflow clamp: flush to +0.0 when x < -1075 (below the smallest f64
// subnormal's exponent); UGE is likewise true for NaN.
3022 SDValue CondLo = DAG.getSetCC(
3023 DL, MVT::i1, X, DAG.getConstantFP(-1075.0, DL, MVT::f64), ISD::SETUGE);
3024 SDValue Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
3025 Z = DAG.getSelect(DL, MVT::f64, CondLo, Z, Zero, Flags);
3026
3027 return Z;
3028}
3029
// NOTE(review): the defining signature line (original line 3030) is missing
// from this extract; from the body this is the exp2 (FEXP2) lowering entry
// point. f64 is routed to the software FEXP path, f16 is promoted to f32 and
// handled with the hardware EXP node, and f32 uses the hardware EXP node
// directly, adding a scale-up/scale-down fixup when denormal inputs must be
// honored (v_exp_f32 flushes denormal results).
3031 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3032 // If we have to handle denormals, scale up the input and adjust the result.
3033
3034 EVT VT = Op.getValueType();
3035 if (VT == MVT::f64)
3036 return lowerFEXPF64(Op, DAG);
3037
3038 SDLoc SL(Op);
3039 SDValue Src = Op.getOperand(0);
3040 SDNodeFlags Flags = Op->getFlags();
3041
3042 if (VT == MVT::f16) {
3043 // Nothing in half is a denormal when promoted to f32.
3044 assert(!isTypeLegal(MVT::f16));
3045 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
// 'Log' is a holdover name — this value is the exp2 result, not a logarithm.
3046 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
3047 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
3048 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3049 }
3050
3051 assert(VT == MVT::f32);
3052
3053 if (!needsDenormHandlingF32(DAG, Src, Flags))
3054 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
3055
// For inputs below -0x1.f8p+6 (≈ -126) the result would be denormal; add 64
// to the input before EXP and multiply the result by 2^-64 afterwards so the
// hardware instruction never produces a denormal that would be flushed.
3056 // bool needs_scaling = x < -0x1.f80000p+6f;
3057 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3058
3059 // -nextafter(128.0, -1)
3060 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
3061
3062 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3063
3064 SDValue NeedsScaling =
3065 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
3066
3067 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
3068 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3069
3070 SDValue AddOffset =
3071 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
3072
3073 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
3074 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
3075
3076 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
3077 SDValue One = DAG.getConstantFP(1.0, SL, VT);
3078 SDValue ResultScale =
3079 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
3080
3081 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
3082 }
3083
// NOTE(review): the first signature line (original line 3084) is missing from
// this extract; the visible parameters are (…X, …SL,) SelectionDAG &DAG,
// SDNodeFlags Flags, bool IsExp10. Shared fast-math lowering for exp/exp10:
// one multiply by the appropriate log2 constant feeding an exp2. Only valid
// under approximate-function semantics (single constant, no hi/lo split).
3085 SelectionDAG &DAG,
3086 SDNodeFlags Flags,
3087 bool IsExp10) const {
3088 // exp(x) -> exp2(M_LOG2E_F * x);
3089 // exp10(x) -> exp2(log2(10) * x);
3090 EVT VT = X.getValueType();
// 0x1.a934f0p+1 is log2(10) rounded to f32; log2e is used for plain exp.
3091 SDValue Const =
3092 DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
3093
3094 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
// f32 can use the native hardware EXP node; other types fall back to the
// generic FEXP2 node and whatever lowering it receives later.
3095 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
3096 : (unsigned)ISD::FEXP2,
3097 SL, VT, Mul, Flags);
3098 }
3099
// NOTE(review): the first signature line (original line 3100) is missing from
// this extract; this is the approximate exp(x) lowering. When f32 denormal
// results must be preserved, the input is pre-biased by +64 (in the exp2
// domain) and the result rescaled so v_exp_f32 never flushes the output.
3101 SelectionDAG &DAG,
3102 SDNodeFlags Flags) const {
3103 EVT VT = X.getValueType();
3104 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
3105 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
3106
3107 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3108
// Threshold -0x1.5d58a0p+6 ≈ -87.3; presumably ≈ ln of the smallest normal
// f32 range, below which exp(x) is denormal — TODO(review): confirm against
// the ROCm device-libs implementation this mirrors.
3109 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
3110 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3111
3112 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
3113
3114 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3115
3116 SDValue AdjustedX =
3117 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3118
3119 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
3120 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
3121
3122 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
3123
// Undo the +64 input bias: 0x1.969d48p-93 — presumably exp(-64) rounded to
// f32 (2^-64 carried through the e-base domain) — TODO(review): verify.
3124 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
3125 SDValue AdjustedResult =
3126 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
3127
3128 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3129 Flags);
3130 }
3131
3132 /// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3133 /// handled correctly.
// NOTE(review): the signature line (original line 3134) is missing from this
// extract. 10^x is computed as exp2(x*K0) * exp2(x*K1) where K0+K1 ≈
// log2(10) split into a high part exact in f32 and a low correction term;
// the denorm-handling path additionally pre-biases x by +32 and rescales.
3135 SelectionDAG &DAG,
3136 SDNodeFlags Flags) const {
3137 const EVT VT = X.getValueType();
3138
// f32 uses the native hardware EXP node; otherwise fall back to generic FEXP2.
3139 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3140 : static_cast<unsigned>(ISD::FEXP2);
3141
3142 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3143 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3144 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3145 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3146
3147 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3148 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3149 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3150 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
// NOTE(review): the final FMUL deliberately(?) drops Flags here, unlike the
// scaled path below — looks like an inconsistency worth confirming upstream.
3151 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3152 }
3153
3154 // bool s = x < -0x1.2f7030p+5f;
3155 // x += s ? 0x1.0p+5f : 0.0f;
3156 // exp10 = exp2(x * 0x1.a92000p+1f) *
3157 // exp2(x * 0x1.4f0978p-11f) *
3158 // (s ? 0x1.9f623ep-107f : 1.0f);
3159
3160 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3161
3162 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3163 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3164
3165 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3166 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3167 SDValue AdjustedX =
3168 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3169
3170 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3171 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3172
3173 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3174 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3175 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3176 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3177
3178 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3179
// Undo the +32 input bias: presumably 10^-32 rounded to f32 — TODO confirm.
3180 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3181 SDValue AdjustedResult =
3182 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3183
3184 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3185 Flags);
3186 }
3187
// NOTE(review): the signature line (original line 3188) is missing from this
// extract; this is the shared FEXP/FEXP10 lowering entry point. Order of
// dispatch: f64 -> software path; approx-funcs allowed -> fast single/double
// multiply forms; f16 -> promote and use the fast form (exact enough since no
// f16 value is denormal in f32); f32 -> full table-free Cody-Waite-style
// reduction documented in the Algorithm comment below.
3189 EVT VT = Op.getValueType();
3190
3191 if (VT == MVT::f64)
3192 return lowerFEXPF64(Op, DAG);
3193
3194 SDLoc SL(Op);
3195 SDValue X = Op.getOperand(0);
3196 SDNodeFlags Flags = Op->getFlags();
3197 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3198
3199 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3200 // library behavior. Also, is known-not-daz source sufficient?
3201 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3202 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3203 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3204 }
3205
3206 if (VT.getScalarType() == MVT::f16) {
// Vector f16 is left to generic scalarization/expansion.
3207 if (VT.isVector())
3208 return SDValue();
3209
3210 // Nothing in half is a denormal when promoted to f32.
3211 //
3212 // exp(f16 x) ->
3213 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3214 //
3215 // exp10(f16 x) ->
3216 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3217 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3218 SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
3219 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3220 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3221 }
3222
3223 assert(VT == MVT::f32);
3224
3225 // Algorithm:
3226 //
3227 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3228 //
3229 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3230 // n = 64*m + j, 0 <= j < 64
3231 //
3232 // e^x = 2^((64*m + j + f)/64)
3233 // = (2^m) * (2^(j/64)) * 2^(f/64)
3234 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3235 //
3236 // f = x*(64/ln(2)) - n
3237 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3238 //
3239 // e^x = (2^m) * (2^(j/64)) * e^r
3240 //
3241 // (2^(j/64)) is precomputed
3242 //
3243 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3244 // e^r = 1 + q
3245 //
3246 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3247 //
3248 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
// The PH - E subtraction below must stay an exact FSUB, so clone the flags
// with contraction disabled for that one node.
3249 SDNodeFlags FlagsNoContract = Flags;
3250 FlagsNoContract.setAllowContract(false);
3251
// PH/PL: hi/lo parts of x*log2(e) (or x*log2(10) for exp10).
3252 SDValue PH, PL;
3253 if (Subtarget->hasFastFMAF32()) {
// With fast FMA, split the constant into c (f32) + cc (residual): 49
// significant bits total, recovered exactly via fma.
3254 const float c_exp = numbers::log2ef;
3255 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3256 const float c_exp10 = 0x1.a934f0p+1f;
3257 const float cc_exp10 = 0x1.2f346ep-24f;
3258
3259 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3260 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3261
3262 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3263 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
// FMA0 = x*C - PH recovers the rounding error of the PH multiply.
3264 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3265 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3266 } else {
// Without fast FMA, split x itself (mask off 12 low mantissa bits) and use
// shorter constants so all partial products are exact in f32.
3267 const float ch_exp = 0x1.714000p+0f;
3268 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3269
3270 const float ch_exp10 = 0x1.a92000p+1f;
3271 const float cl_exp10 = 0x1.4f0978p-11f;
3272
3273 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3274 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3275
3276 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3277 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3278 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3279 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3280 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3281
3282 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3283
3284 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3285 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3286 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3287 }
3288
// E = round-to-nearest-even integer part; A = fractional remainder fed to
// the hardware exp2; final result rescaled by 2^E via ldexp.
3289 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3290
3291 // It is unsafe to contract this fsub into the PH multiply.
3292 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3293
3294 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3295 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3296 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3297
3298 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3299
// Below these inputs the true result underflows to zero (exp10/exp).
3300 SDValue UnderflowCheckConst =
3301 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3302
3303 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3304 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3305 SDValue Underflow =
3306 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3307
3308 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3309
3310 if (!Flags.hasNoInfs()) {
// Above these inputs the true result overflows to +inf.
3311 SDValue OverflowCheckConst =
3312 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3313 SDValue Overflow =
3314 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
// NOTE(review): original line 3316 — the operand building the +inf
// constant for 'Inf' — is missing from this extract.
3315 SDValue Inf =
3317 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3318 }
3319
3320 return R;
3321 }
3322
3323static bool isCtlzOpc(unsigned Opc) {
3324 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3325}
3326
3327static bool isCttzOpc(unsigned Opc) {
3328 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3329}
3330
// NOTE(review): the first signature line (original line 3331) is missing from
// this extract; this lowers sub-32-bit ctlz/cttz results (i8/i16) by widening
// the operation to i32, where the hardware count instructions operate.
3332 SelectionDAG &DAG) const {
3333 auto SL = SDLoc(Op);
3334 auto Opc = Op.getOpcode();
3335 auto Arg = Op.getOperand(0u);
3336 auto ResultVT = Op.getValueType();
3337
// Only the narrow integer types need this custom widening.
3338 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3339 return {};
3340
// NOTE(review): original line 3341 is missing from this extract — presumably
// an assert on the opcode (e.g. isCtlzOpc(Opc) || isCttzOpc(Opc)); confirm
// against upstream.
3342 assert(ResultVT == Arg.getValueType());
3343
3344 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3345 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3346 SDValue NewOp;
3347
3348 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
// For ctlz, shift the value to the top of the i32 so the extra high bits do
// not contribute leading zeros.
3349 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3350 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3351 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3352 } else {
// For the trailing-zero form, zero-extend (the padding zeros sit above the
// original value); the widened count includes the 32-NumBits extra zeros
// only in the all-zero case, which is poison for ZERO_UNDEF anyway, so the
// SUB yields the narrow-width count.
3353 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3354 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3355 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3356 }
3357
3358 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3359 }
3360
// NOTE(review): the signature line (original line 3361) is missing from this
// extract; this lowers i32/i64 ctlz/cttz to the AMDGPU FFBH/FFBL nodes,
// handling the "defined at zero" variants by clamping with umin.
3362 SDLoc SL(Op);
3363 SDValue Src = Op.getOperand(0);
3364
3365 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3366 bool Ctlz = isCtlzOpc(Op.getOpcode());
3367 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3368
3369 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3370 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
// Uniform i64 values can use the scalar 64-bit count instructions directly.
3371 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3372
3373 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3374 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3375 // (cttz hi:lo) -> (umin (ffbl src), 32)
3376 // (ctlz_zero_undef src) -> (ffbh src)
3377 // (cttz_zero_undef src) -> (ffbl src)
3378
3379 // 64-bit scalar version produce 32-bit result
3380 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3381 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3382 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3383 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3384 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3385 if (!ZeroUndef) {
// The hardware returns -1 (all ones) for a zero input; clamp to bit width.
3386 const SDValue ConstVal = DAG.getConstant(
3387 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3388 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3389 }
3390 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3391 }
3392
// Divergent i64: combine per-half 32-bit counts.
3393 SDValue Lo, Hi;
3394 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3395
3396 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3397 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3398
3399 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3400 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3401 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3402 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3403
// UADDSAT keeps the -1 "input was zero" sentinel saturated at UINT32_MAX so
// the final umin against 64 still applies; ADD suffices when zero is poison.
3404 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3405 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3406 if (Ctlz)
3407 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3408 else
3409 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3410
3411 SDValue NewOpr;
3412 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3413 if (!ZeroUndef) {
3414 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3415 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3416 }
3417
3418 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3419 }
3420
// NOTE(review): the first signature line (original line 3421) is missing from
// this extract; the visible trailing parameter is 'bool Signed'. Converts an
// i64 to f32 by normalizing to 32 bits (see the pseudo code below), doing a
// native 32-bit conversion, then scaling by 2^(32 - shamt).
3422 bool Signed) const {
3423 // The regular method converting a 64-bit integer to float roughly consists of
3424 // 2 steps: normalization and rounding. In fact, after normalization, the
3425 // conversion from a 64-bit integer to a float is essentially the same as the
3426 // one from a 32-bit integer. The only difference is that it has more
3427 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3428 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3429 // converted into the correct float number. The basic steps for the unsigned
3430 // conversion are illustrated in the following pseudo code:
3431 //
3432 // f32 uitofp(i64 u) {
3433 // i32 hi, lo = split(u);
3434 // // Only count the leading zeros in hi as we have native support of the
3435 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3436 // // reduced to a 32-bit one automatically.
3437 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3438 // u <<= shamt;
3439 // hi, lo = split(u);
3440 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3441 // // convert it as a 32-bit integer and scale the result back.
3442 // return uitofp(hi) * 2^(32 - shamt);
3443 // }
3444 //
3445 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3446 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3447 // converted instead followed by negation based its sign bit.
3448
3449 SDLoc SL(Op);
3450 SDValue Src = Op.getOperand(0);
3451
3452 SDValue Lo, Hi;
3453 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3454 SDValue Sign;
3455 SDValue ShAmt;
3456 if (Signed && Subtarget->isGCN()) {
3457 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3458 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3459 // account. That is, the maximal shift is
3460 // - 32 if Lo and Hi have opposite signs;
3461 // - 33 if Lo and Hi have the same sign.
3462 //
3463 // Or, MaxShAmt = 33 + OppositeSign, where
3464 //
3465 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3466 // - -1 if Lo and Hi have opposite signs; and
3467 // - 0 otherwise.
3468 //
3469 // All in all, ShAmt is calculated as
3470 //
3471 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3472 //
3473 // or
3474 //
3475 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3476 //
3477 // to reduce the critical path.
3478 SDValue OppositeSign = DAG.getNode(
3479 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3480 DAG.getConstant(31, SL, MVT::i32));
3481 SDValue MaxShAmt =
3482 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3483 OppositeSign);
3484 // Count the leading sign bits.
3485 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3486 // Different from unsigned conversion, the shift should be one bit less to
3487 // preserve the sign bit.
3488 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3489 DAG.getConstant(1, SL, MVT::i32));
3490 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3491 } else {
3492 if (Signed) {
3493 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3494 // absolute value first.
// Sign is 0 or -1 (i64); abs(x) = (x + sign) ^ sign.
3495 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3496 DAG.getConstant(63, SL, MVT::i64));
3497 SDValue Abs =
3498 DAG.getNode(ISD::XOR, SL, MVT::i64,
3499 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3500 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3501 }
3502 // Count the leading zeros.
3503 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3504 // The shift amount for signed integers is [0, 32].
3505 }
3506 // Normalize the given 64-bit integer.
// NOTE(review): for the signed non-GCN path this shifts the ORIGINAL Src,
// not Abs — presumably intentional since the sign is re-applied at the end;
// worth confirming against upstream.
3507 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3508 // Split it again.
3509 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3510 // Calculate the adjust bit for rounding.
3511 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3512 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3513 DAG.getConstant(1, SL, MVT::i32), Lo);
3514 // Get the 32-bit normalized integer.
3515 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3516 // Convert the normalized 32-bit integer into f32.
3517
3518 bool UseLDEXP = isOperationLegal(ISD::FLDEXP, MVT::f32);
3519 unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3520 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3521
3522 // Finally, need to scale back the converted floating number as the original
3523 // 64-bit integer is converted as a 32-bit one.
3524 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3525 ShAmt);
3526 // On GCN, use LDEXP directly.
3527 if (UseLDEXP)
3528 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3529
3530 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3531 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3532 // exponent is enough to avoid overflowing into the sign bit.
3533 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3534 DAG.getConstant(23, SL, MVT::i32));
3535 SDValue IVal =
3536 DAG.getNode(ISD::ADD, SL, MVT::i32,
3537 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3538 if (Signed) {
3539 // Set the sign bit.
3540 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3541 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3542 DAG.getConstant(31, SL, MVT::i32));
3543 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3544 }
3545 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3546 }
3547
// NOTE(review): the first signature line (original line 3548) is missing from
// this extract; the trailing parameter is 'bool Signed'. Converts i64 to f64
// exactly: convert each 32-bit half separately (the high half with the
// signedness of the overall conversion), scale the high half by 2^32 with
// ldexp, and add. Both partial conversions are exact in f64, so only the
// final add rounds.
3549 SDLoc SL(Op);
3550 SDValue Src = Op.getOperand(0);
3551
3552 SDValue Lo, Hi;
3553 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3554
// NOTE(review): original line 3556 — the first line of the CvtHi node (which
// selects SINT_TO_FP vs UINT_TO_FP based on 'Signed') — is missing from this
// extract.
3557 SL, MVT::f64, Hi);
3558
// The low half is always treated as unsigned.
3559 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3560
3561 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3562 DAG.getConstant(32, SL, MVT::i32));
3563 // TODO: Should this propagate fast-math-flags?
3564 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3565 }
3566
// NOTE(review): the signature line (original line 3567) is missing from this
// extract; this is the UINT_TO_FP lowering dispatcher: promote i16 sources,
// round f32 results to bf16, and route i64 sources to the f32/f64 helpers.
3568 SelectionDAG &DAG) const {
3569 // TODO: Factor out code common with LowerSINT_TO_FP.
3570 EVT DestVT = Op.getValueType();
3571 SDValue Src = Op.getOperand(0);
3572 EVT SrcVT = Src.getValueType();
3573
3574 if (SrcVT == MVT::i16) {
// i16 -> f16 is handled natively by instruction selection.
3575 if (DestVT == MVT::f16)
3576 return Op;
3577 SDLoc DL(Op);
3578
3579 // Promote src to i32
3580 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3581 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3582 }
3583
// bf16 has no direct conversion: go through f32 and round.
3584 if (DestVT == MVT::bf16) {
3585 SDLoc SL(Op);
3586 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3587 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3588 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3589 }
3590
// Everything below is the custom i64 handling.
3591 if (SrcVT != MVT::i64)
3592 return Op;
3593
3594 if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
3595 SDLoc DL(Op);
3596
// i64 -> f16: convert to f32 first, then round (double rounding is
// acceptable here per the existing lowering).
3597 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3598 SDValue FPRoundFlag =
3599 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3600 SDValue FPRound =
3601 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3602
3603 return FPRound;
3604 }
3605
3606 if (DestVT == MVT::f32)
3607 return LowerINT_TO_FP32(Op, DAG, false);
3608
3609 assert(DestVT == MVT::f64);
3610 return LowerINT_TO_FP64(Op, DAG, false);
3611 }
3612
// NOTE(review): the signature line (original line 3613) is missing from this
// extract; this is the SINT_TO_FP lowering dispatcher, mirroring
// LowerUINT_TO_FP with sign-extension / signed helpers.
3614 SelectionDAG &DAG) const {
3615 EVT DestVT = Op.getValueType();
3616
3617 SDValue Src = Op.getOperand(0);
3618 EVT SrcVT = Src.getValueType();
3619
3620 if (SrcVT == MVT::i16) {
// i16 -> f16 is handled natively by instruction selection.
3621 if (DestVT == MVT::f16)
3622 return Op;
3623
3624 SDLoc DL(Op);
3625 // Promote src to i32
3626 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3627 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3628 }
3629
// bf16 has no direct conversion: go through f32 and round.
3630 if (DestVT == MVT::bf16) {
3631 SDLoc SL(Op);
3632 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3633 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3634 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3635 }
3636
3637 if (SrcVT != MVT::i64)
3638 return Op;
3639
3640 // TODO: Factor out code common with LowerUINT_TO_FP.
3641
3642 if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
3643 SDLoc DL(Op);
// NOTE(review): this local 'Src' shadows the outer one with the same value;
// harmless but could be dropped for consistency with LowerUINT_TO_FP.
3644 SDValue Src = Op.getOperand(0);
3645
3646 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3647 SDValue FPRoundFlag =
3648 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3649 SDValue FPRound =
3650 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3651
3652 return FPRound;
3653 }
3654
3655 if (DestVT == MVT::f32)
3656 return LowerINT_TO_FP32(Op, DAG, true);
3657
3658 assert(DestVT == MVT::f64);
3659 return LowerINT_TO_FP64(Op, DAG, true);
3660 }
3661
// NOTE(review): the first signature line (original line 3662) is missing from
// this extract; the trailing parameter is 'bool Signed'. Converts f32/f64 to
// i64 by truncating, splitting the value into hi/lo 32-bit pieces in the FP
// domain (see the pseudo code below), and converting each piece natively.
3663 SDLoc SL(Op);
3664
3665 SDValue Src = Op.getOperand(0);
3666 EVT SrcVT = Src.getValueType();
3667
3668 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3669
3670 // The basic idea of converting a floating point number into a pair of 32-bit
3671 // integers is illustrated as follows:
3672 //
3673 // tf := trunc(val);
3674 // hif := floor(tf * 2^-32);
3675 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3676 // hi := fptoi(hif);
3677 // lo := fptoi(lof);
3678 //
3679 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3680 SDValue Sign;
3681 if (Signed && SrcVT == MVT::f32) {
3682 // However, a 32-bit floating point number has only 23 bits mantissa and
3683 // it's not enough to hold all the significant bits of `lof` if val is
3684 // negative. To avoid the loss of precision, We need to take the absolute
3685 // value after truncating and flip the result back based on the original
3686 // signedness.
// Sign is broadcast from the sign bit: 0 or -1 in i32.
3687 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3688 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3689 DAG.getConstant(31, SL, MVT::i32));
3690 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3691 }
3692
// K0 = 2^-32, K1 = -2^32, expressed bit-exactly for the source type.
3693 SDValue K0, K1;
3694 if (SrcVT == MVT::f64) {
3695 K0 = DAG.getConstantFP(
3696 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3697 SrcVT);
3698 K1 = DAG.getConstantFP(
3699 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3700 SrcVT);
3701 } else {
3702 K0 = DAG.getConstantFP(
3703 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3704 K1 = DAG.getConstantFP(
3705 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3706 }
3707 // TODO: Should this propagate fast-math-flags?
3708 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3709
3710 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3711
// Fma = Trunc - FloorMul * 2^32, i.e. the low 32 bits in FP form.
3712 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3713
// NOTE(review): original line 3716 — the alternate opcode of this select
// (presumably ': ISD::FP_TO_UINT,') — is missing from this extract.
3715 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3717 SL, MVT::i32, FloorMul);
3718 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3719
3720 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3721 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3722
3723 if (Signed && SrcVT == MVT::f32) {
3724 assert(Sign);
3725 // Flip the result based on the signedness, which is either all 0s or 1s.
3726 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3727 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3728 // r := xor(r, sign) - sign;
3729 Result =
3730 DAG.getNode(ISD::SUB, SL, MVT::i64,
3731 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3732 }
3733
3734 return Result;
3735 }
3736
// NOTE(review): the signature line (original line 3737) is missing from this
// extract; this lowers FP_TO_FP16. f32 sources map to the target node (which
// also enables known-bits reasoning); f64 sources either use the generic
// approximate expansion or the bit-exact software path below.
3738 SDLoc DL(Op);
3739 SDValue N0 = Op.getOperand(0);
3740
3741 // Convert to target node to get known bits
3742 if (N0.getValueType() == MVT::f32)
3743 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3744
3745 if (Op->getFlags().hasApproximateFuncs()) {
3746 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3747 return SDValue();
3748 }
3749
3750 return LowerF64ToF16Safe(N0, DL, DAG);
3751 }
3752
3753 // return node in i32
// NOTE(review): the first signature line (original line 3754) is missing from
// this extract; the visible trailing parameter is 'SelectionDAG &DAG'. This
// is an integer-only f64 -> f16 conversion with round-to-nearest-even,
// returning the 16-bit pattern widened in an i32 node.
3755 SelectionDAG &DAG) const {
3756 assert(Src.getSimpleValueType() == MVT::f64);
3757
3758 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3759 // TODO: We can generate better code for True16.
3760 const unsigned ExpMask = 0x7ff;
3761 const unsigned ExpBiasf64 = 1023;
3762 const unsigned ExpBiasf16 = 15;
3763 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3764 SDValue One = DAG.getConstant(1, DL, MVT::i32);
// U = raw f64 bits; UH = upper 32 bits (sign, exponent, top mantissa).
3765 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3766 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3767 DAG.getConstant(32, DL, MVT::i64));
3768 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3769 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
// E = biased f64 exponent, rebased to the f16 bias.
3770 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3771 DAG.getConstant(20, DL, MVT::i64));
3772 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3773 DAG.getConstant(ExpMask, DL, MVT::i32));
3774 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3775 // add the f16 bias (15) to get the biased exponent for the f16 format.
3776 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3777 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3778
// M = top 11 mantissa bits positioned for f16 (with one guard bit).
3779 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3780 DAG.getConstant(8, DL, MVT::i32));
3781 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3782 DAG.getConstant(0xffe, DL, MVT::i32));
3783
// MaskedSig = all discarded low mantissa bits; non-zero means "sticky".
3784 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3785 DAG.getConstant(0x1ff, DL, MVT::i32));
3786 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3787
3788 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3789 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3790
// I = f16 Inf/NaN pattern: quiet bit set iff the mantissa was non-zero.
3791 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3792 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3793 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3794 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3795
3796 // N = M | (E << 12);
3797 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3798 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3799 DAG.getConstant(12, DL, MVT::i32)));
3800
// B = denormal shift amount for results below the f16 normal range.
3801 // B = clamp(1-E, 0, 13);
3802 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3803 One, E);
3804 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3805 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3806 DAG.getConstant(13, DL, MVT::i32));
3807
// SigSetHigh = mantissa with the implicit leading 1 made explicit.
3808 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3809 DAG.getConstant(0x1000, DL, MVT::i32));
3810
// D = denormalized significand with a sticky bit for what was shifted out.
3811 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3812 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3813 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3814 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3815
// V = pre-rounding result: denormal path if E < 1, normal path otherwise.
3816 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
// Round to nearest even using the low 3 bits (guard/round/sticky).
3817 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3818 DAG.getConstant(0x7, DL, MVT::i32));
3819 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3820 DAG.getConstant(2, DL, MVT::i32));
3821 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3822 One, Zero, ISD::SETEQ);
3823 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3824 One, Zero, ISD::SETGT);
3825 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3826 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3827
// Exponent overflow -> Inf; source exponent 1039 (f64 Inf/NaN) -> I.
3828 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3829 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3830 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3831 I, V, ISD::SETEQ);
3832
3833 // Extract the sign bit.
3834 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3835 DAG.getConstant(16, DL, MVT::i32));
3836 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3837 DAG.getConstant(0x8000, DL, MVT::i32));
3838
3839 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3840 }
3841
// Custom lowering for FP_TO_SINT / FP_TO_UINT.
// NOTE(review): the first signature line (original line 3842) was lost in
// extraction — presumably "SDValue AMDGPUTargetLowering::LowerFP_TO_INT(
// SDValue Op," — confirm against upstream.
3843 SelectionDAG &DAG) const {
3844 SDValue Src = Op.getOperand(0);
3845 unsigned OpOpcode = Op.getOpcode();
3846 EVT SrcVT = Src.getValueType();
3847 EVT DestVT = Op.getValueType();
3848
3849 // Will be selected natively
3850 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3851 return Op;
3852
// bf16 sources have no direct conversion: extend to f32, then redo the
// same FP_TO_* operation on the promoted value.
3853 if (SrcVT == MVT::bf16) {
3854 SDLoc DL(Op);
3855 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3856 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3857 }
3858
3859 // Promote i16 to i32
3860 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3861 SDLoc DL(Op);
3862
// Convert to i32 and truncate to the requested i16 result.
3863 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3864 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3865 }
3866
// Everything below handles only i64 results.
3867 if (DestVT != MVT::i64)
3868 return Op;
3869
// f16 sources (possibly hidden behind an FP16_TO_FP extension to f32) go
// through an i32 conversion, then extend to i64.
3870 if (SrcVT == MVT::f16 ||
3871 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3872 SDLoc DL(Op);
3873
3874 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3875 unsigned Ext =
// NOTE(review): original line 3876 was lost in extraction — presumably the
// select of ISD::SIGN_EXTEND vs ISD::ZERO_EXTEND based on OpOpcode being
// ISD::FP_TO_SINT; confirm against upstream.
3877 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3878 }
3879
// Full 64-bit expansion for f32/f64 sources.
3880 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3881 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3882
// No custom lowering applies.
3883 return SDValue();
3884}
3885
// Custom lowering for FP_TO_SINT_SAT / FP_TO_UINT_SAT (saturating FP->int).
// NOTE(review): the first signature line (original line 3886) was lost in
// extraction — presumably "SDValue AMDGPUTargetLowering::LowerFP_TO_INT_SAT(
// SDValue Op," — confirm against upstream.
3887 SelectionDAG &DAG) const {
3888 SDValue Src = Op.getOperand(0);
3889 unsigned OpOpcode = Op.getOpcode();
3890 EVT SrcVT = Src.getValueType();
3891 EVT DstVT = Op.getValueType();
// Operand 1 is the VT node describing the saturation width.
3892 SDValue SatVTOp = Op.getNode()->getOperand(1);
3893 EVT SatVT = cast<VTSDNode>(SatVTOp)->getVT();
3894 SDLoc DL(Op);
3895
3896 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3897 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3898 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3899
3900 // Will be selected natively
3901 if (DstVT == MVT::i32 && SatWidth == DstWidth &&
3902 (SrcVT == MVT::f32 || SrcVT == MVT::f64))
3903 return Op;
3904
3905 const SDValue Int32VT = DAG.getValueType(MVT::i32);
3906
3907 // Perform all saturation at i32 and truncate
3908 if (SatWidth < DstWidth) {
3909 const uint64_t Int32Width = 32;
// First do a fully-i32-saturating conversion, then clamp to SatWidth.
3910 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src, Int32VT);
3911 SDValue Int32SatVal;
3912
3913 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
// Note the naming: MinConst holds the signed *maximum* of SatWidth (used
// with SMIN to clamp the upper bound), MaxConst the signed *minimum*
// (used with SMAX to clamp the lower bound).
3914 SDValue MinConst = DAG.getConstant(
3915 APInt::getSignedMaxValue(SatWidth).sext(Int32Width), DL, MVT::i32);
3916 SDValue MaxConst = DAG.getConstant(
3917 APInt::getSignedMinValue(SatWidth).sext(Int32Width), DL, MVT::i32);
3918 SDValue MinVal =
3919 DAG.getNode(ISD::SMIN, DL, MVT::i32, FpToInt32, MinConst);
3920 Int32SatVal = DAG.getNode(ISD::SMAX, DL, MVT::i32, MinVal, MaxConst);
3921 } else {
// Unsigned: a single UMIN against the SatWidth maximum suffices.
3922 SDValue MinConst = DAG.getConstant(
3923 APInt::getMaxValue(SatWidth).zext(Int32Width), DL, MVT::i32);
3924 Int32SatVal = DAG.getNode(ISD::UMIN, DL, MVT::i32, FpToInt32, MinConst);
3925 }
3926
3927 if (DstWidth == Int32Width)
3928 return Int32SatVal;
3929 if (DstWidth < Int32Width)
3930 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Int32SatVal);
3931
3932 // DstWidth > Int32Width
3933 const unsigned Ext =
// NOTE(review): original line 3934 was lost in extraction — presumably the
// select of ISD::SIGN_EXTEND vs ISD::ZERO_EXTEND based on the opcode;
// confirm against upstream.
3935 return DAG.getNode(Ext, DL, DstVT, FpToInt32);
3936 }
3937
3938 // SatWidth == DstWidth
3939
3940 // Saturate at i32 for i64 dst and 16b src (will invoke f16 promotion below)
3941 if (DstVT == MVT::i64 &&
3942 (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
3943 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
3944 return DAG.getNode(OpOpcode, DL, DstVT, Src, Int32VT);
3945 }
3946
3947 // Promote f16/bf16 src to f32
3948 if (SrcVT == MVT::f16 || SrcVT == MVT::bf16) {
3949 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3950 return DAG.getNode(Op.getOpcode(), DL, DstVT, PromotedSrc, SatVTOp);
3951 }
3952
3953 // Promote sub-i32 dst to i32 with sub-i32 saturation
3954 if (DstWidth < 32) {
3955 // Note: this triggers SatWidth < DstWidth above to generate saturated
3956 // truncate by requesting MVT::i32 destination with SatWidth < 32.
3957 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src, SatVTOp);
3958 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, FpToInt32);
3959 }
3960
3961 // TODO: can we implement i64 dst for f32/f64?
3962
3963 return SDValue();
3964}
3965
// Lower a vector SIGN_EXTEND_INREG by scalarizing: extract every element,
// sign-extend-in-register each scalar, and rebuild the vector.
// NOTE(review): the first signature line (original line 3966) was lost in
// extraction — confirm against upstream.
3967 SelectionDAG &DAG) const {
3968 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3969 MVT VT = Op.getSimpleValueType();
3970 MVT ScalarVT = VT.getScalarType();
3971
// Only the vector form is custom-lowered here.
3972 assert(VT.isVector());
3973
3974 SDValue Src = Op.getOperand(0);
3975 SDLoc DL(Op);
3976
3977 // TODO: Don't scalarize on Evergreen?
3978 unsigned NElts = VT.getVectorNumElements();
// NOTE(review): original line 3979 was lost in extraction — presumably the
// declaration of the Args SmallVector filled below; confirm upstream.
3980 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3981
3982 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3983 for (unsigned I = 0; I < NElts; ++I)
3984 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3985
3986 return DAG.getBuildVector(VT, DL, Args);
3987}
3988
3989//===----------------------------------------------------------------------===//
3990// Custom DAG optimizations
3991//===----------------------------------------------------------------------===//
3992
3993static bool isU24(SDValue Op, SelectionDAG &DAG) {
3994 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3995}
3996
// Returns true if \p Op is known to fit in a signed 24-bit value.
3997 static bool isI24(SDValue Op, SelectionDAG &DAG) {
3998 EVT VT = Op.getValueType();
3999 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
4000 // as unsigned 24-bit values.
// NOTE(review): original line 4001 was lost in extraction — presumably the
// second conjunct "AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;";
// confirm against upstream.
4002}
4003
// Shared simplification for the 24-bit multiply nodes/intrinsics: only the
// low 24 bits of each operand matter, so try to simplify the operands under
// that demanded-bits mask.
// NOTE(review): the signature lines (original lines 4004-4005, presumably
// "SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24, ...DCI)
// const {") were lost in extraction — confirm against upstream.
4006 SelectionDAG &DAG = DCI.DAG;
4007 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// The node is either a target MUL*_24 node or the raw amdgcn intrinsic;
// the intrinsic form carries the intrinsic ID as operand 0.
4008 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
4009
4010 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
4011 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
4012 unsigned NewOpcode = Node24->getOpcode();
4013 if (IsIntrin) {
// Map the intrinsic ID to the equivalent target node opcode so the
// rebuilt node is in canonical form.
4014 unsigned IID = Node24->getConstantOperandVal(0);
4015 switch (IID) {
4016 case Intrinsic::amdgcn_mul_i24:
4017 NewOpcode = AMDGPUISD::MUL_I24;
4018 break;
4019 case Intrinsic::amdgcn_mul_u24:
4020 NewOpcode = AMDGPUISD::MUL_U24;
4021 break;
4022 case Intrinsic::amdgcn_mulhi_i24:
4023 NewOpcode = AMDGPUISD::MULHI_I24;
4024 break;
4025 case Intrinsic::amdgcn_mulhi_u24:
4026 NewOpcode = AMDGPUISD::MULHI_U24;
4027 break;
4028 default:
4029 llvm_unreachable("Expected 24-bit mul intrinsic");
4030 }
4031 }
4032
// Only the low 24 bits of the operands feed the result.
4033 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
4034
4035 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
4036 // the operands to have other uses, but will only perform simplifications that
4037 // involve bypassing some nodes for this user.
4038 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
4039 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
4040 if (DemandedLHS || DemandedRHS)
4041 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
4042 DemandedLHS ? DemandedLHS : LHS,
4043 DemandedRHS ? DemandedRHS : RHS);
4044
4045 // Now try SimplifyDemandedBits which can simplify the nodes used by our
4046 // operands if this node is the only user.
4047 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
4048 return SDValue(Node24, 0);
4049 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
4050 return SDValue(Node24, 0);
4051
4052 return SDValue();
4053}
4054
// Constant-fold a 32-bit bitfield extract of \p Src0 at bit \p Offset with
// width \p Width. IntTy selects signed (sext the field) vs unsigned (zext)
// extraction.
// NOTE(review): the signature line (original line 4056, presumably
// "static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
// uint32_t Offset,") was lost in extraction — confirm against upstream.
4055 template <typename IntTy>
4057 uint32_t Width, const SDLoc &DL) {
4058 if (Width + Offset < 32) {
// Shift the field up to the top, then arithmetic/logical shift back down
// so the field's sign (or zero) fills the high bits per IntTy.
4059 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
4060 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
4061 if constexpr (std::is_signed_v<IntTy>) {
4062 return DAG.getSignedConstant(Result, DL, MVT::i32);
4063 } else {
4064 return DAG.getConstant(Result, DL, MVT::i32);
4065 }
4066 }
4067
// Field reaches bit 31: a plain shift extracts it without masking.
4068 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
4069}
4070
4071static bool hasVolatileUser(SDNode *Val) {
4072 for (SDNode *U : Val->users()) {
4073 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
4074 if (M->isVolatile())
4075 return true;
4076 }
4077 }
4078
4079 return false;
4080}
4081
// Decide whether a load/store of \p VT should be rewritten to an equivalent
// integer memory type (see getEquivalentMemType). Returns false for types
// that are already legal/canonical or that cannot be repacked.
// NOTE(review): the signature line (original line 4082, presumably
// "bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {")
// was lost in extraction — confirm against upstream.
4083 // i32 vectors are the canonical memory type.
4084 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
4085 return false;
4086
// Sub-byte types cannot be recast to integer memory types.
4087 if (!VT.isByteSized())
4088 return false;
4089
4090 unsigned Size = VT.getStoreSize();
4091
// Scalar 8/16/32-bit accesses are already fine as-is.
4092 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
4093 return false;
4094
// 3-byte and non-dword-multiple sizes have no friendlier equivalent.
4095 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
4096 return false;
4097
4098 return true;
4099}
4100
// Replace load of an illegal type with a bitcast from a load of a friendlier
// type.
// NOTE(review): extraction lost original line 4103 (the signature first line)
// and line 4108 (presumably "LoadSDNode *LN = cast<LoadSDNode>(N);") and
// line 4125 (presumably the "if (!allowsMisalignedMemoryAccesses(" opener) —
// confirm against upstream.
4104 DAGCombinerInfo &DCI) const {
// Only worth doing before type legalization.
4105 if (!DCI.isBeforeLegalize())
4106 return SDValue();
4107
// Skip volatile/atomic/extending loads and loads feeding volatile users.
4109 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
4110 return SDValue();
4111
4112 SDLoc SL(N);
4113 SelectionDAG &DAG = DCI.DAG;
4114 EVT VT = LN->getMemoryVT();
4115
4116 unsigned Size = VT.getStoreSize();
4117 Align Alignment = LN->getAlign();
4118 if (Alignment < Size && isTypeLegal(VT)) {
4119 unsigned IsFast;
4120 unsigned AS = LN->getAddressSpace();
4121
4122 // Expand unaligned loads earlier than legalization. Due to visitation order
4123 // problems during legalization, the emitted instructions to pack and unpack
4124 // the bytes again are not eliminated in the case of an unaligned copy.
4126 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
4127 if (VT.isVector())
4128 return SplitVectorLoad(SDValue(LN, 0), DAG);
4129
// Scalar case: expand into byte-level loads plus reassembly; the expansion
// returns both the value and the new chain.
4130 SDValue Ops[2];
4131 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
4132
4133 return DAG.getMergeValues(Ops, SDLoc(N));
4134 }
4135
4136 if (!IsFast)
4137 return SDValue();
4138 }
4139
4140 if (!shouldCombineMemoryType(VT))
4141 return SDValue();
4142
// Load as the equivalent integer type and bitcast back to the original VT.
4143 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4144
4145 SDValue NewLoad
4146 = DAG.getLoad(NewVT, SL, LN->getChain(),
4147 LN->getBasePtr(), LN->getMemOperand());
4148
// Replace both the value result and the chain of the original load.
4149 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
4150 DCI.CombineTo(N, BC, NewLoad.getValue(1));
4151 return SDValue(N, 0);
4152}
4153
// Replace store of an illegal type with a store of a bitcast to a friendlier
// type.
// NOTE(review): extraction lost original line 4156 (the signature first line),
// line 4161 (presumably "StoreSDNode *SN = cast<StoreSDNode>(N);") and
// line 4179 (presumably the "if (!allowsMisalignedMemoryAccesses(" opener) —
// confirm against upstream.
4157 DAGCombinerInfo &DCI) const {
// Only worth doing before type legalization.
4158 if (!DCI.isBeforeLegalize())
4159 return SDValue();
4160
// Skip volatile/atomic/truncating stores.
4162 if (!SN->isSimple() || !ISD::isNormalStore(SN))
4163 return SDValue();
4164
4165 EVT VT = SN->getMemoryVT();
4166 unsigned Size = VT.getStoreSize();
4167
4168 SDLoc SL(N);
4169 SelectionDAG &DAG = DCI.DAG;
4170 Align Alignment = SN->getAlign();
4171 if (Alignment < Size && isTypeLegal(VT)) {
4172 unsigned IsFast;
4173 unsigned AS = SN->getAddressSpace();
4174
4175 // Expand unaligned stores earlier than legalization. Due to visitation
4176 // order problems during legalization, the emitted instructions to pack and
4177 // unpack the bytes again are not eliminated in the case of an unaligned
4178 // copy.
4180 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
4181 if (VT.isVector())
4182 return SplitVectorStore(SDValue(SN, 0), DAG);
4183
4184 return expandUnalignedStore(SN, DAG);
4185 }
4186
4187 if (!IsFast)
4188 return SDValue();
4189 }
4190
4191 if (!shouldCombineMemoryType(VT))
4192 return SDValue();
4193
// Bitcast the stored value to the equivalent integer type and store that.
4194 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4195 SDValue Val = SN->getValue();
4196
4197 //DCI.AddToWorklist(Val.getNode());
4198
// If the value has other users, keep a round-trip bitcast for them so the
// original-typed value is still available.
4199 bool OtherUses = !Val.hasOneUse();
4200 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
4201 if (OtherUses) {
4202 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
4203 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
4204 }
4205
4206 return DAG.getStore(SN->getChain(), SL, CastVal,
4207 SN->getBasePtr(), SN->getMemOperand());
4208}
4209
// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
// issues.
// Push an AssertZext/AssertSext through a truncate of its operand.
// NOTE(review): the signature first line (original line 4213) was lost in
// extraction — confirm against upstream.
4214 DAGCombinerInfo &DCI) const {
4215 SelectionDAG &DAG = DCI.DAG;
4216 SDValue N0 = N->getOperand(0);
4217
4218 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4219 // (vt2 (truncate (assertzext vt0:x, vt1)))
4220 if (N0.getOpcode() == ISD::TRUNCATE) {
4221 SDValue N1 = N->getOperand(1);
4222 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4223 SDLoc SL(N);
4224
4225 SDValue Src = N0.getOperand(0);
4226 EVT SrcVT = Src.getValueType();
// Only valid if the pre-truncate type is wide enough to hold the asserted
// extension type.
4227 if (SrcVT.bitsGE(ExtVT)) {
4228 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4229 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4230 }
4231 }
4232
4233 return SDValue();
4234}
4235
// DAG combines for chain-free AMDGCN intrinsics.
// NOTE(review): the signature first line (original line 4236) was lost in
// extraction — confirm against upstream.
4237 SDNode *N, DAGCombinerInfo &DCI) const {
4238 unsigned IID = N->getConstantOperandVal(0);
4239 switch (IID) {
// 24-bit multiplies share the demanded-bits simplification.
4240 case Intrinsic::amdgcn_mul_i24:
4241 case Intrinsic::amdgcn_mul_u24:
4242 case Intrinsic::amdgcn_mulhi_i24:
4243 case Intrinsic::amdgcn_mulhi_u24:
4244 return simplifyMul24(N, DCI);
// These fold to undef when their source is undef.
4245 case Intrinsic::amdgcn_fract:
4246 case Intrinsic::amdgcn_rsq:
4247 case Intrinsic::amdgcn_rcp_legacy:
4248 case Intrinsic::amdgcn_rsq_legacy:
4249 case Intrinsic::amdgcn_rsq_clamp:
4250 case Intrinsic::amdgcn_tanh:
4251 case Intrinsic::amdgcn_prng_b32: {
4252 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4253 SDValue Src = N->getOperand(1);
4254 return Src.isUndef() ? Src : SDValue();
4255 }
4256 case Intrinsic::amdgcn_frexp_exp: {
4257 // frexp_exp (fneg x) -> frexp_exp x
4258 // frexp_exp (fabs x) -> frexp_exp x
4259 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4260 SDValue Src = N->getOperand(1);
4261 SDValue PeekSign = peekFPSignOps(Src);
4262 if (PeekSign == Src)
4263 return SDValue();
// Rewrite the intrinsic in place with the sign-op-stripped source.
4264 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4265 0);
4266 }
4267 default:
4268 return SDValue();
4269 }
4270}
4271
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
// NOTE(review): the signature first line (original line 4274, presumably
// "SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(") was lost
// in extraction — confirm against upstream.
4275 DAGCombinerInfo &DCI, const SDLoc &SL,
4276 unsigned Opc, SDValue LHS,
4277 uint32_t ValLo, uint32_t ValHi) const {
4278 SelectionDAG &DAG = DCI.DAG;
4279 SDValue Lo, Hi;
4280 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4281
// Pair each 32-bit half with its constant operand.
4282 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4283 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4284
4285 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4286 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4287
4288 // Re-visit the ands. It's possible we eliminated one of them and it could
4289 // simplify the vector.
4290 DCI.AddToWorklist(Lo.getNode());
4291 DCI.AddToWorklist(Hi.getNode());
4292
// Reassemble the i64 result from the two halves.
4293 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4294 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4295}
4296
// DAG combine for SHL: canonicalize small shifts of extended i16 values and
// narrow i64 shifts by >= 32 to a 32-bit shift plus a zero low half.
// NOTE(review): extraction lost original line 4297 (the signature first line)
// and line 4302 (presumably "ConstantSDNode *CRHS =
// dyn_cast<ConstantSDNode>(RHS);") — confirm against upstream.
4298 DAGCombinerInfo &DCI) const {
4299 EVT VT = N->getValueType(0);
4300 SDValue LHS = N->getOperand(0);
4301 SDValue RHS = N->getOperand(1);
4303 SDLoc SL(N);
4304 SelectionDAG &DAG = DCI.DAG;
4305
4306 unsigned RHSVal;
4307 if (CRHS) {
4308 RHSVal = CRHS->getZExtValue();
// A shift by zero is the identity.
4309 if (!RHSVal)
4310 return LHS;
4311
4312 switch (LHS->getOpcode()) {
4313 default:
4314 break;
4315 case ISD::ZERO_EXTEND:
4316 case ISD::SIGN_EXTEND:
4317 case ISD::ANY_EXTEND: {
4318 SDValue X = LHS->getOperand(0);
4319
4320 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4321 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4322 // Prefer build_vector as the canonical form if packed types are legal.
4323 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
4324 SDValue Vec = DAG.getBuildVector(
4325 MVT::v2i16, SL,
4326 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4327 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4328 }
4329
4330 // shl (ext x) => zext (shl x), if shift does not overflow int
4331 if (VT != MVT::i64)
4332 break;
4333 KnownBits Known = DAG.computeKnownBits(X);
4334 unsigned LZ = Known.countMinLeadingZeros();
// Safe only if the shifted-out bits are known zero in the narrow type.
4335 if (LZ < RHSVal)
4336 break;
4337 EVT XVT = X.getValueType();
4338 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4339 return DAG.getZExtOrTrunc(Shl, SL, VT);
4340 }
4341 }
4342 }
4343
// Everything below handles only (vectors of) i64.
4344 if (VT.getScalarType() != MVT::i64)
4345 return SDValue();
4346
4347 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4348 // common case, splitting this into a move and a 32-bit shift is faster and
4349 // the same code size.
4350 KnownBits Known = DAG.computeKnownBits(RHS);
4351
4352 EVT ElementType = VT.getScalarType();
4353 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4354 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4355
// Only profitable when the shift amount is provably >= 32: the low half of
// the result is then always zero.
4356 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4357 return SDValue();
4358 SDValue ShiftAmt;
4359
4360 if (CRHS) {
4361 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4362 TargetType);
4363 } else {
4364 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4365 const SDValue ShiftMask =
4366 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4367 // This AND instruction will clamp out of bounds shift values.
4368 // It will also be removed during later instruction selection.
4369 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4370 }
4371
// The result's high half is the low half of the source shifted by
// (amount - 32); the low half is zero.
4372 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4373 SDValue NewShift =
4374 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4375
4376 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4377 SDValue Vec;
4378
4379 if (VT.isVector()) {
// Interleave zero low halves with shifted high halves per element.
4380 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4381 unsigned NElts = TargetType.getVectorNumElements();
// NOTE(review): original line 4382 was lost in extraction — presumably the
// declaration of the HiOps SmallVector filled below; confirm upstream.
4383 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4384
4385 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4386 for (unsigned I = 0; I != NElts; ++I)
4387 HiAndLoOps[2 * I + 1] = HiOps[I];
4388 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4389 } else {
4390 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4391 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4392 }
4393 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4394}
4395
4397 DAGCombinerInfo &DCI) const {
4398 SDValue RHS = N->getOperand(1);
4400 EVT VT = N->getValueType(0);
4401 SDValue LHS = N->getOperand(0);
4402 SelectionDAG &DAG = DCI.DAG;
4403 SDLoc SL(N);
4404
4405 if (VT.getScalarType() != MVT::i64)
4406 return SDValue();
4407
4408 // For C >= 32
4409 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))
4410
4411 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4412 // common case, splitting this into a move and a 32-bit shift is faster and
4413 // the same code size.
4414 KnownBits Known = DAG.computeKnownBits(RHS);
4415
4416 EVT ElementType = VT.getScalarType();
4417 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4418 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4419
4420 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4421 return SDValue();
4422
4423 SDValue ShiftFullAmt =
4424 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4425 SDValue ShiftAmt;
4426 if (CRHS) {
4427 unsigned RHSVal = CRHS->getZExtValue();
4428 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4429 TargetType);
4430 } else if (Known.getMinValue().getZExtValue() ==
4431 (ElementType.getSizeInBits() - 1)) {
4432 ShiftAmt = ShiftFullAmt;
4433 } else {
4434 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4435 const SDValue ShiftMask =
4436 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4437 // This AND instruction will clamp out of bounds shift values.
4438 // It will also be removed during later instruction selection.
4439 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4440 }
4441
4442 EVT ConcatType;
4443 SDValue Hi;
4444 SDLoc LHSSL(LHS);
4445 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4446 if (VT.isVector()) {
4447 unsigned NElts = TargetType.getVectorNumElements();
4448 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4449 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4450 SmallVector<SDValue, 8> HiOps(NElts);
4451 SmallVector<SDValue, 16> HiAndLoOps;
4452
4453 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4454 for (unsigned I = 0; I != NElts; ++I) {
4455 HiOps[I] = HiAndLoOps[2 * I + 1];
4456 }
4457 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4458 } else {
4459 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4460 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4461 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4462 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4463 }
4464
4465 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4466 SDValue HiShift;
4467 if (KnownLHS.isNegative()) {
4468 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4469 } else {
4470 Hi = DAG.getFreeze(Hi);
4471 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4472 }
4473 SDValue NewShift =
4474 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4475
4476 SDValue Vec;
4477 if (VT.isVector()) {
4478 unsigned NElts = TargetType.getVectorNumElements();
4481 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4482
4483 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4484 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4485 for (unsigned I = 0; I != NElts; ++I) {
4486 HiAndLoOps[2 * I + 1] = HiOps[I];
4487 HiAndLoOps[2 * I] = LoOps[I];
4488 }
4489 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4490 } else {
4491 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4492 }
4493 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4494}
4495
4497 DAGCombinerInfo &DCI) const {
4498 SDValue RHS = N->getOperand(1);
4500 EVT VT = N->getValueType(0);
4501 SDValue LHS = N->getOperand(0);
4502 SelectionDAG &DAG = DCI.DAG;
4503 SDLoc SL(N);
4504 unsigned RHSVal;
4505
4506 if (CRHS) {
4507 RHSVal = CRHS->getZExtValue();
4508
4509 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4510 // this improves the ability to match BFE patterns in isel.
4511 if (LHS.getOpcode() == ISD::AND) {
4512 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4513 unsigned MaskIdx, MaskLen;
4514 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4515 MaskIdx == RHSVal) {
4516 return DAG.getNode(ISD::AND, SL, VT,
4517 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4518 N->getOperand(1)),
4519 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4520 N->getOperand(1)));
4521 }
4522 }
4523 }
4524 }
4525
4526 if (VT.getScalarType() != MVT::i64)
4527 return SDValue();
4528
4529 // for C >= 32
4530 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4531
4532 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4533 // common case, splitting this into a move and a 32-bit shift is faster and
4534 // the same code size.
4535 KnownBits Known = DAG.computeKnownBits(RHS);
4536
4537 EVT ElementType = VT.getScalarType();
4538 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4539 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4540
4541 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4542 return SDValue();
4543
4544 SDValue ShiftAmt;
4545 if (CRHS) {
4546 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4547 TargetType);
4548 } else {
4549 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4550 const SDValue ShiftMask =
4551 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4552 // This AND instruction will clamp out of bounds shift values.
4553 // It will also be removed during later instruction selection.
4554 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4555 }
4556
4557 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4558 EVT ConcatType;
4559 SDValue Hi;
4560 SDLoc LHSSL(LHS);
4561 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4562 if (VT.isVector()) {
4563 unsigned NElts = TargetType.getVectorNumElements();
4564 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4565 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4566 SmallVector<SDValue, 8> HiOps(NElts);
4567 SmallVector<SDValue, 16> HiAndLoOps;
4568
4569 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4570 for (unsigned I = 0; I != NElts; ++I)
4571 HiOps[I] = HiAndLoOps[2 * I + 1];
4572 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4573 } else {
4574 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4575 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4576 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4577 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4578 }
4579
4580 SDValue NewShift =
4581 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4582
4583 SDValue Vec;
4584 if (VT.isVector()) {
4585 unsigned NElts = TargetType.getVectorNumElements();
4587 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4588
4589 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4590 for (unsigned I = 0; I != NElts; ++I)
4591 HiAndLoOps[2 * I] = LoOps[I];
4592 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4593 } else {
4594 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4595 }
4596 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4597}
4598
// DAG combines for TRUNCATE: peek through bitcasts of build_vectors and
// shrink wide shifts whose interesting bits survive the truncate.
// NOTE(review): the signature first line (original line 4599) was lost in
// extraction — confirm against upstream.
4600 SDNode *N, DAGCombinerInfo &DCI) const {
4601 SDLoc SL(N);
4602 SelectionDAG &DAG = DCI.DAG;
4603 EVT VT = N->getValueType(0);
4604 SDValue Src = N->getOperand(0);
4605
4606 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4607 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4608 SDValue Vec = Src.getOperand(0);
4609 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4610 SDValue Elt0 = Vec.getOperand(0);
4611 EVT EltVT = Elt0.getValueType();
// Valid only when the truncated result fits inside element 0.
4612 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4613 if (EltVT.isFloatingPoint()) {
4614 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4615 EltVT.changeTypeToInteger(), Elt0);
4616 }
4617
4618 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4619 }
4620 }
4621 }
4622
4623 // Equivalent of above for accessing the high element of a vector as an
4624 // integer operation.
4625 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4626 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4627 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4628 SDValue BV = stripBitcast(Src.getOperand(0));
4629 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4630 EVT SrcEltVT = BV.getOperand(0).getValueType();
4631 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4632 unsigned BitIndex = K->getZExtValue();
4633 unsigned PartIndex = BitIndex / SrcEltSize;
4634
// The shift amount must land exactly on an element boundary and select
// an element that actually exists in the build_vector.
4635 if (PartIndex * SrcEltSize == BitIndex &&
4636 PartIndex < BV.getNumOperands()) {
4637 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4638 SDValue SrcElt =
4639 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4640 BV.getOperand(PartIndex));
4641 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4642 }
4643 }
4644 }
4645 }
4646 }
4647
4648 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4649 //
4650 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4651 // i16 (trunc (srl (i32 (trunc x), K)))
4652 if (VT.getScalarSizeInBits() < 32) {
4653 EVT SrcVT = Src.getValueType();
4654 if (SrcVT.getScalarSizeInBits() > 32 &&
4655 (Src.getOpcode() == ISD::SRL ||
4656 Src.getOpcode() == ISD::SRA ||
4657 Src.getOpcode() == ISD::SHL)) {
4658 SDValue Amt = Src.getOperand(1);
4659 KnownBits Known = DAG.computeKnownBits(Amt);
4660
4661 // - For left shifts, do the transform as long as the shift
4662 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4663 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4664 // losing information stored in the high bits when truncating.
4665 const unsigned MaxCstSize =
4666 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4667 if (Known.getMaxValue().ule(MaxCstSize)) {
4668 EVT MidVT = VT.isVector() ?
4669 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4670 VT.getVectorNumElements()) : MVT::i32;
4671
4672 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4673 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4674 Src.getOperand(0));
4675 DCI.AddToWorklist(Trunc.getNode());
4676
// Re-type the shift amount to the i32 shift's amount type if needed.
4677 if (Amt.getValueType() != NewShiftVT) {
4678 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4679 DCI.AddToWorklist(Amt.getNode());
4680 }
4681
4682 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4683 Trunc, Amt);
4684 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4685 }
4686 }
4687 }
4688
4689 return SDValue();
4690}
4691
4692// We need to specifically handle i64 mul here to avoid unnecessary conversion
4693// instructions. If we only match on the legalized i64 mul expansion,
4694// SimplifyDemandedBits will be unable to remove them because there will be
4695// multiple uses due to the separate mul + mulh[su].
4696static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4697 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4698 if (Size <= 32) {
4699 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4700 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4701 }
4702
4703 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4704 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4705
4706 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4707 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4708
4709 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4710}
4711
4712/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4713/// return SDValue().
4714static SDValue getAddOneOp(const SDNode *V) {
4715 if (V->getOpcode() != ISD::ADD)
4716 return SDValue();
4717
4718 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4719}
4720
// DAG combine for MUL: re-associate (mul x, (add y, 1)) for mad matching and
// form 24-bit multiplies when both operands are known to fit in 24 bits.
// NOTE(review): the signature first line (original line 4721) was lost in
// extraction — confirm against upstream.
4722 DAGCombinerInfo &DCI) const {
4723 assert(N->getOpcode() == ISD::MUL);
4724 EVT VT = N->getValueType(0);
4725
4726 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4727 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4728 // unnecessarily). isDivergent() is used as an approximation of whether the
4729 // value is in an SGPR.
4730 if (!N->isDivergent())
4731 return SDValue();
4732
4733 unsigned Size = VT.getSizeInBits();
4734 if (VT.isVector() || Size > 64)
4735 return SDValue();
4736
4737 SelectionDAG &DAG = DCI.DAG;
4738 SDLoc DL(N);
4739
4740 SDValue N0 = N->getOperand(0);
4741 SDValue N1 = N->getOperand(1);
4742
4743 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4744 // matching.
4745
4746 // mul x, (add y, 1) -> add (mul x, y), x
// Returns y when V is (add y, 1) and all of V's users are multiplies (so
// the un-canonicalization cannot pessimize other users).
4747 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4748 SDValue AddOp = getAddOneOp(V.getNode());
4749 if (!AddOp)
4750 return SDValue();
4751
4752 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4753 return U->getOpcode() == ISD::MUL;
4754 }))
4755 return AddOp;
4756
4757 return SDValue();
4758 };
4759
4760 // FIXME: The selection pattern is not properly checking for commuted
4761 // operands, so we have to place the mul in the LHS
4762 if (SDValue MulOper = IsFoldableAdd(N0)) {
4763 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4764 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4765 }
4766
4767 if (SDValue MulOper = IsFoldableAdd(N1)) {
4768 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4769 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4770 }
4771
4772 // There are i16 integer mul/mad.
4773 if (isTypeLegal(MVT::i16) && VT.getScalarType().bitsLE(MVT::i16))
4774 return SDValue();
4775
4776 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4777 // in the source into any_extends if the result of the mul is truncated. Since
4778 // we can assume the high bits are whatever we want, use the underlying value
4779 // to avoid the unknown high bits from interfering.
4780 if (N0.getOpcode() == ISD::ANY_EXTEND)
4781 N0 = N0.getOperand(0);
4782
4783 if (N1.getOpcode() == ISD::ANY_EXTEND)
4784 N1 = N1.getOperand(0);
4785
4786 SDValue Mul;
4787
// Prefer the unsigned form when both operands fit unsigned 24-bit; fall
// back to the signed form; otherwise no 24-bit multiply applies.
4788 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4789 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4790 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4791 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4792 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4793 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4794 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4795 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4796 } else {
4797 return SDValue();
4798 }
4799
4800 // We need to use sext even for MUL_U24, because MUL_U24 is used
4801 // for signed multiply of 8 and 16-bit types.
4802 return DAG.getSExtOrTrunc(Mul, DL, VT);
4803}
4804
SDValue
// AMDGPUTargetLowering::performMulLoHiCombine — combine for ISD::SMUL_LOHI /
// ISD::UMUL_LOHI (see the dispatch in PerformDAGCombine).
// NOTE(review): the continuation line carrying the function name is elided
// from this view (extraction artifact) — verify against upstream sources.
                                                DAGCombinerInfo &DCI) const {
  // Only the i32 form is handled here.
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated. Since
  // we can assume the high bits are whatever we want, use the underlying value
  // to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(0);
  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(0);

  // Try to use two fast 24-bit multiplies (one for each half of the result)
  // instead of one slow extending multiply.
  unsigned LoOpcode = 0;
  unsigned HiOpcode = 0;
  if (Signed) {
    if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
      N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
      N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
      LoOpcode = AMDGPUISD::MUL_I24;
      HiOpcode = AMDGPUISD::MULHI_I24;
    }
  } else {
    if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
      N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
      N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
      LoOpcode = AMDGPUISD::MUL_U24;
      HiOpcode = AMDGPUISD::MULHI_U24;
    }
  }
  if (!LoOpcode)
    return SDValue();

  // Replace both results of the MUL_LOHI node in place and return the node
  // itself so the combiner knows it was updated.
  SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
  SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
  DCI.CombineTo(N, Lo, Hi);
  return SDValue(N, 0);
}
4854
// AMDGPUTargetLowering::performMulhsCombine — combine for ISD::MULHS: use the
// fast MULHI_I24 when both operands are provably signed 24-bit values.
// NOTE(review): the signature line preceding this one is elided from this
// view (extraction artifact) — verify against upstream sources.
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (!Subtarget->hasMulI24() || VT.isVector())
    return SDValue();

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
  // valu op anyway)
  if (Subtarget->hasSMulHi() && !N->isDivergent())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (!isI24(N0, DAG) || !isI24(N1, DAG))
    return SDValue();

  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);

  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
  DCI.AddToWorklist(Mulhi.getNode());
  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
}
4887
// AMDGPUTargetLowering::performMulhuCombine — combine for ISD::MULHU: use the
// fast MULHI_U24 when both operands are provably unsigned 24-bit values.
// NOTE(review): the signature line preceding this one is elided from this
// view (extraction artifact) — verify against upstream sources.
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
    return SDValue();

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
  // valu op anyway)
  if (!N->isDivergent() && Subtarget->hasSMulHi())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (!isU24(N0, DAG) || !isU24(N1, DAG))
    return SDValue();

  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);

  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
  DCI.AddToWorklist(Mulhi.getNode());
  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
}
4920
4921SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4922 SDValue Op,
4923 const SDLoc &DL,
4924 unsigned Opc) const {
4925 EVT VT = Op.getValueType();
4926 if (VT.bitsGT(MVT::i32))
4927 return SDValue();
4928
4929 if (VT != MVT::i32)
4930 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4931
4932 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4933 if (VT != MVT::i32)
4934 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4935
4936 return FFBX;
4937}
4938
4939// The native instructions return -1 on 0 input. Optimize out a select that
4940// produces -1 on 0.
4941//
4942// TODO: If zero is not undef, we could also do this if the output is compared
4943// against the bitwidth.
4944//
4945// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
// AMDGPUTargetLowering::performCtlz_CttzCombine — fold a select that guards a
// ctlz/cttz_zero_undef against zero input into the native FFBH/FFBL node,
// which already returns -1 on 0.
// NOTE(review): the first signature line (taking the SDLoc and the setcc
// condition) is elided from this view (extraction artifact) — verify
// against upstream sources.
                                                  SDValue LHS, SDValue RHS,
                                                  DAGCombinerInfo &DCI) const {
  if (!isNullConstant(Cond.getOperand(1)))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  SDValue CmpLHS = Cond.getOperand(0);

  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
  if (CCOpcode == ISD::SETEQ &&
      (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
      RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
    unsigned Opc =
        isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
  }

  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
  if (CCOpcode == ISD::SETNE &&
      (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
      LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
    unsigned Opc =
        isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;

    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
  }

  return SDValue();
}
4979
// distributeOpThroughSelect — rewrite (select c, (op x), (op y)) as
// (op (select c, x, y)), where Op is a unary operation present on both
// arms of N1/N2 (the caller guarantees both arms have operand 0 available).
// NOTE(review): the first signature line (taking the DAGCombinerInfo) is
// elided from this view (extraction artifact) — verify against upstream.
                                         unsigned Op,
                                         const SDLoc &SL,
                                         SDValue Cond,
                                         SDValue N1,
                                         SDValue N2) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N1.getValueType();

  // Select on the inner operands, then re-apply the common operation once.
  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
                                  N1.getOperand(0), N2.getOperand(0));
  DCI.AddToWorklist(NewSelect.getNode());
  return DAG.getNode(Op, SL, VT, NewSelect);
}
4994
4995// Pull a free FP operation out of a select so it may fold into uses.
4996//
4997// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4998// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4999//
5000// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
5001// select c, (fabs x), +k -> fabs (select c, x, k)
SDValue
// AMDGPUTargetLowering::foldFreeOpFromSelect — pull a free FP operation
// (fneg/fabs) out of a select so it may fold into uses; see the comment
// block above for the patterns handled.
// NOTE(review): the continuation line carrying the function name is elided
// from this view (extraction artifact) — verify against upstream sources.
                                           SDValue N) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Cond = N.getOperand(0);
  SDValue LHS = N.getOperand(1);
  SDValue RHS = N.getOperand(2);

  EVT VT = N.getValueType();
  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
      (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
    // NOTE(review): the guard condition line for this early return is elided
    // in this view.
      return SDValue();

    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
                                     SDLoc(N), Cond, LHS, RHS);
  }

  // Canonicalize the fneg/fabs operand into LHS, remembering that the arms
  // were swapped so the final select can be built in the original order.
  bool Inv = false;
  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
    std::swap(LHS, RHS);
    Inv = true;
  }

  // TODO: Support vector constants.
  // NOTE(review): the declaration of CRHS (presumably a
  // dyn_cast<ConstantFPSDNode> of RHS) is elided in this view.
  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
      !selectSupportsSourceMods(N.getNode())) {
    SDLoc SL(N);
    // If one side is an fneg/fabs and the other is a constant, we can push the
    // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
    SDValue NewLHS = LHS.getOperand(0);
    SDValue NewRHS = RHS;

    // Careful: if the neg can be folded up, don't try to pull it back down.
    bool ShouldFoldNeg = true;

    if (NewLHS.hasOneUse()) {
      unsigned Opc = NewLHS.getOpcode();
      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
        ShouldFoldNeg = false;
      if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
        ShouldFoldNeg = false;
    }

    if (ShouldFoldNeg) {
      if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
        return SDValue();

      // We're going to be forced to use a source modifier anyway, there's no
      // point to pulling the negate out unless we can get a size reduction by
      // negating the constant.
      //
      // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
      // about cheaper constants.
      if (NewLHS.getOpcode() == ISD::FABS &&
          // NOTE(review): the negate-cost condition continuation is elided in
          // this view.
        return SDValue();

      // NOTE(review): an additional guard condition line is elided in this
      // view.
        return SDValue();

      if (LHS.getOpcode() == ISD::FNEG)
        NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

      if (Inv)
        std::swap(NewLHS, NewRHS);

      SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
                                      Cond, NewLHS, NewRHS);
      DCI.AddToWorklist(NewSelect.getNode());
      return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
    }
  }

  return SDValue();
}
5079
// AMDGPUTargetLowering::performSelectCombine — combine for ISD::SELECT:
// pull free fneg/fabs out of the select, canonicalize constants into the
// false input, try the legacy fmin/fmax patterns, and finally the
// ctlz/cttz select fold.
// NOTE(review): the signature line preceding this one is elided from this
// view (extraction artifact) — verify against upstream sources.
                                                   DAGCombinerInfo &DCI) const {
  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
    return Folded;

  SDValue Cond = N->getOperand(0);
  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue LHS = Cond.getOperand(0);
  SDValue RHS = Cond.getOperand(1);
  SDValue CC = Cond.getOperand(2);

  SDValue True = N->getOperand(1);
  SDValue False = N->getOperand(2);

  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    SelectionDAG &DAG = DCI.DAG;
    if (DAG.isConstantValueOfAnyType(True) &&
        !DAG.isConstantValueOfAnyType(False)) {
      // Swap cmp + select pair to move constant to false input.
      // This will allow using VOPC cndmasks more often.
      // select (setcc x, y), k, x -> select (setccinv x, y), x, k

      SDLoc SL(N);
      ISD::CondCode NewCC =
          getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());

      SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
      return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
    }

    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
      // NOTE(review): the declaration of MinMax is elided in this view.
          = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
      // Revisit this node so we can catch min3/max3/med3 patterns.
      //DCI.AddToWorklist(MinMax.getNode());
      return MinMax;
    }
  }

  // There's no reason to not do this if the condition has other uses.
  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
}
5125
5126static bool isInv2Pi(const APFloat &APF) {
5127 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5128 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5129 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5130
5131 return APF.bitwiseIsEqual(KF16) ||
5132 APF.bitwiseIsEqual(KF32) ||
5133 APF.bitwiseIsEqual(KF64);
5134}
5135
// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
// additional cost to negate them.
// NOTE(review): the function signature lines (returning NegatibleCost and
// taking a ConstantFPSDNode *C) are elided from this view (extraction
// artifact) — verify against upstream sources.
  if (C->isZero())
    return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;

  if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
    return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;

  // NOTE(review): the final return statement (the neutral-cost fallback) is
  // elided from this view.
}
5148
5154
5160
5161static unsigned inverseMinMax(unsigned Opc) {
5162 switch (Opc) {
5163 case ISD::FMAXNUM:
5164 return ISD::FMINNUM;
5165 case ISD::FMINNUM:
5166 return ISD::FMAXNUM;
5167 case ISD::FMAXNUM_IEEE:
5168 return ISD::FMINNUM_IEEE;
5169 case ISD::FMINNUM_IEEE:
5170 return ISD::FMAXNUM_IEEE;
5171 case ISD::FMAXIMUM:
5172 return ISD::FMINIMUM;
5173 case ISD::FMINIMUM:
5174 return ISD::FMAXIMUM;
5175 case ISD::FMAXIMUMNUM:
5176 return ISD::FMINIMUMNUM;
5177 case ISD::FMINIMUMNUM:
5178 return ISD::FMAXIMUMNUM;
5179 case AMDGPUISD::FMAX_LEGACY:
5180 return AMDGPUISD::FMIN_LEGACY;
5181 case AMDGPUISD::FMIN_LEGACY:
5182 return AMDGPUISD::FMAX_LEGACY;
5183 default:
5184 llvm_unreachable("invalid min/max opcode");
5185 }
5186}
5187
/// \return true if it's profitable to try to push an fneg into its source
/// instruction.
// NOTE(review): the function signature line (taking the fneg node N and its
// source operand N0) is elided from this view (extraction artifact) —
// verify against upstream sources.
  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (N0.hasOneUse()) {
    // This may be able to fold into the source, but at a code size cost. Don't
    // fold if the fold into the user is free.
    if (allUsesHaveSourceMods(N, 0))
      return false;
  } else {
    if (fnegFoldsIntoOp(N0.getNode()) &&
        // NOTE(review): the second conjunct of this condition is elided in
        // this view.
      return false;
  }

  return true;
}
5208
// AMDGPUTargetLowering::performFNegCombine — combine for ISD::FNEG: push the
// negate into its source operation so it can be absorbed as a source
// modifier (or cancel with an existing fneg). Each case below documents the
// rewrite it performs.
// NOTE(review): the signature line preceding this one is elided from this
// view (extraction artifact); additional elided lines are marked below.
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  unsigned Opc = N0.getOpcode();

  if (!shouldFoldFNegIntoSrc(N, N0))
    return SDValue();

  SDLoc SL(N);
  switch (Opc) {
  case ISD::FADD: {
    if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
      return SDValue();

    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() != ISD::FNEG)
      LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    else
      LHS = LHS.getOperand(0);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != ISD::FADD)
      return SDValue(); // Op got folded away.
    // Other users of N0 still need the original value; give them
    // fneg of the rewritten node, which is equivalent.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMUL:
  case AMDGPUISD::FMUL_LEGACY: {
    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (RHS.getOpcode() == ISD::FNEG)
      RHS = RHS.getOperand(0);
    else
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMA:
  case ISD::FMAD: {
    // TODO: handle llvm.amdgcn.fma.legacy
    if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
      return SDValue();

    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    SDValue LHS = N0.getOperand(0);
    SDValue MHS = N0.getOperand(1);
    SDValue RHS = N0.getOperand(2);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (MHS.getOpcode() == ISD::FNEG)
      MHS = MHS.getOperand(0);
    else
      MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMIN_LEGACY: {
    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)

    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    // 0 doesn't have a negated inline immediate.
    // TODO: This constant check should be generalized to other operations.
    // NOTE(review): the guard condition line for this early return is elided
    // in this view.
      return SDValue();

    SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    unsigned Opposite = inverseMinMax(Opc);

    SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
    if (Res.getOpcode() != Opposite)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case AMDGPUISD::FMED3: {
    // fneg (fmed3 x, y, z) -> fmed3 (fneg x), (fneg y), (fneg z)
    SDValue Ops[3];
    for (unsigned I = 0; I < 3; ++I)
      Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());

    SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
    if (Res.getOpcode() != AMDGPUISD::FMED3)
      return SDValue(); // Op got folded away.

    if (!N0.hasOneUse()) {
      SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
      DAG.ReplaceAllUsesWith(N0, Neg);

      for (SDNode *U : Neg->users())
        DCI.AddToWorklist(U);
    }

    return Res;
  }
  case ISD::FP_EXTEND:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT: // XXX - Should fround be handled?
  case ISD::FROUNDEVEN:
  case ISD::FSIN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW: {
    SDValue CvtSrc = N0.getOperand(0);
    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_extend (fneg x))) -> (fp_extend x)
      // (fneg (rcp (fneg x))) -> (rcp x)
      return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_extend x)) -> (fp_extend (fneg x))
    // (fneg (rcp x)) -> (rcp (fneg x))
    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
  }
  case ISD::FP_ROUND: {
    SDValue CvtSrc = N0.getOperand(0);

    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_round (fneg x))) -> (fp_round x)
      return DAG.getNode(ISD::FP_ROUND, SL, VT,
                         CvtSrc.getOperand(0), N0.getOperand(1));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_round x)) -> (fp_round (fneg x))
    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
  }
  case ISD::FP16_TO_FP: {
    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
    // f16, but legalization of f16 fneg ends up pulling it out of the source.
    // Put the fneg back as a legal source operation that can be matched later.
    SDLoc SL(N);

    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
    SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
                                  DAG.getConstant(0x8000, SL, SrcVT));
    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
  }
  case ISD::SELECT: {
    // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
    // TODO: Invert conditions of foldFreeOpFromSelect
    return SDValue();
  }
  case ISD::BITCAST: {
    SDLoc SL(N);
    SDValue BCSrc = N0.getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
      if (HighBits.getValueType().getSizeInBits() != 32 ||
          !fnegFoldsIntoOp(HighBits.getNode()))
        return SDValue();

      // f64 fneg only really needs to operate on the high half of of the
      // register, so try to force it to an f32 operation to help make use of
      // source modifiers.
      //
      //
      // fneg (f64 (bitcast (build_vector x, y))) ->
      // f64 (bitcast (build_vector (bitcast i32:x to f32),
      //                            (fneg (bitcast i32:y to f32)))

      SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
      SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
      SDValue CastBack =
          DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);

      // NOTE(review): the declaration of Ops (a copy of BCSrc's operand
      // list) is elided in this view.
      Ops.back() = CastBack;
      DCI.AddToWorklist(NegHi.getNode());
      SDValue Build =
          DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
      SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);

      if (!N0.hasOneUse())
        DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
      return Result;
    }

    if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
        BCSrc.hasOneUse()) {
      // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
      // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)

      // TODO: Cast back result for multiple uses is beneficial in some cases.

      SDValue LHS =
          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
      SDValue RHS =
          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));

      SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
      SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);

      return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
                         NegRHS);
    }

    return SDValue();
  }
  default:
    return SDValue();
  }
}
5472
// AMDGPUTargetLowering::performFAbsCombine — combine for ISD::FABS: fold the
// abs into an illegal-f16 conversion as an integer mask of the sign bit.
// NOTE(review): the signature line preceding this one is elided from this
// view (extraction artifact) — verify against upstream sources.
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);

  if (!N0.hasOneUse())
    return SDValue();

  switch (N0.getOpcode()) {
  case ISD::FP16_TO_FP: {
    assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
    SDLoc SL(N);
    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
    SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
                                  DAG.getConstant(0x7fff, SL, SrcVT));
    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
  }
  default:
    return SDValue();
  }
}
5497
// AMDGPUTargetLowering::performRcpCombine — constant-fold RCP/RCP_IFLAG of a
// floating-point constant into 1.0 / k.
// NOTE(review): the signature line preceding this one is elided from this
// view (extraction artifact) — verify against upstream sources.
                                                DAGCombinerInfo &DCI) const {
  const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CFP)
    return SDValue();

  // XXX - Should this flush denormals?
  const APFloat &Val = CFP->getValueAPF();
  APFloat One(Val.getSemantics(), "1.0");
  return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
}
5509
// AMDGPUTargetLowering::PerformDAGCombine — main DAG combine dispatcher for
// the common AMDGPU lowering: handles bitcast simplifications and the BFE
// folds inline, and delegates every other opcode to the perform*Combine
// helpers above.
// NOTE(review): the signature line preceding this one and several interior
// lines are elided from this view (extraction artifact); each gap is marked
// below — verify against upstream sources.
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch(N->getOpcode()) {
  default:
    break;
  case ISD::BITCAST: {
    EVT DestVT = N->getValueType(0);

    // Push casts through vector builds. This helps avoid emitting a large
    // number of copies when materializing floating point vector constants.
    //
    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
    if (DestVT.isVector()) {
      SDValue Src = N->getOperand(0);
      if (Src.getOpcode() == ISD::BUILD_VECTOR &&
          // NOTE(review): the condition continuation line(s) are elided in
          // this view.
        EVT SrcVT = Src.getValueType();
        unsigned NElts = DestVT.getVectorNumElements();

        if (SrcVT.getVectorNumElements() == NElts) {
          EVT DestEltVT = DestVT.getVectorElementType();

          SmallVector<SDValue, 8> CastedElts;
          SDLoc SL(N);
          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
            SDValue Elt = Src.getOperand(I);
            CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
          }

          return DAG.getBuildVector(DestVT, SL, CastedElts);
        }
      }
    }

    if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
      break;

    // Fold bitcasts of constants.
    //
    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    // TODO: Generalize and move to DAGCombiner
    SDValue Src = N->getOperand(0);
    // NOTE(review): the guarding dyn_cast<ConstantSDNode> line defining C is
    // elided in this view.
      SDLoc SL(N);
      uint64_t CVal = C->getZExtValue();
      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                               DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                               DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
      return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
    }

    // NOTE(review): the guarding dyn_cast<ConstantFPSDNode> line defining C
    // is elided in this view.
      const APInt &Val = C->getValueAPF().bitcastToAPInt();
      SDLoc SL(N);
      uint64_t CVal = Val.getZExtValue();
      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));

      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
    }

    break;
  }
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: {
    // Range metadata can be invalidated when loads are converted to legal types
    // (e.g. v2i64 -> v4i32).
    // Try to convert vector shl/sra/srl before type legalization so that range
    // metadata can be utilized.
    if (!(N->getValueType(0).isVector() &&
          // NOTE(review): the condition continuation line(s) are elided in
          // this view.
      break;
    if (N->getOpcode() == ISD::SHL)
      return performShlCombine(N, DCI);
    if (N->getOpcode() == ISD::SRA)
      return performSraCombine(N, DCI);
    return performSrlCombine(N, DCI);
  }
  case ISD::TRUNCATE:
    return performTruncateCombine(N, DCI);
  case ISD::MUL:
    return performMulCombine(N, DCI);
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    if (SDValue Simplified = simplifyMul24(N, DCI))
      return Simplified;
    break;
  }
  case AMDGPUISD::MULHI_I24:
  case AMDGPUISD::MULHI_U24:
    return simplifyMul24(N, DCI);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return performMulLoHiCombine(N, DCI);
  case ISD::MULHS:
    return performMulhsCombine(N, DCI);
  case ISD::MULHU:
    return performMulhuCombine(N, DCI);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::FNEG:
    return performFNegCombine(N, DCI);
  case ISD::FABS:
    return performFAbsCombine(N, DCI);
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    // BFE only reads the low 5 bits of the width/offset operands.
    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(0, DL, MVT::i32);

    // NOTE(review): the dyn_cast<ConstantSDNode> line defining Offset is
    // elided in this view.
    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of existing
        // DAG Combines. If not eliminated, we will match back to BFE during
        // selection.

        // TODO: The sext_inreg of extended types ends, although we can could
        // handle them in a single BFE.
        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
                           DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
    }

    // Constant source: fold the extract at compile time.
    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        CVal->getSExtValue(),
                                        OffsetVal,
                                        WidthVal,
                                        DL);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       CVal->getZExtValue(),
                                       OffsetVal,
                                       WidthVal,
                                       DL);
    }

    // A field that reaches bit 31 is just a shift (but keep the 16/16 case
    // as a BFE when SDWA can select it directly).
    if ((OffsetVal + WidthVal) >= 32 &&
        !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
      SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                         BitsFrom, ShiftVal);
    }

    if (BitsFrom.hasOneUse()) {
      APInt Demanded = APInt::getBitsSet(32,
                                         OffsetVal,
                                         OffsetVal + WidthVal);

      KnownBits Known;
      // NOTE(review): the TargetLoweringOpt TLO declaration line is elided
      // in this view.
                                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
          TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
        DCI.CommitTargetLoweringOpt(TLO);
      }
    }

    break;
  }
  case ISD::LOAD:
    return performLoadCombine(N, DCI);
  case ISD::STORE:
    return performStoreCombine(N, DCI);
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_IFLAG:
    return performRcpCombine(N, DCI);
  case ISD::AssertZext:
  case ISD::AssertSext:
    return performAssertSZExtCombine(N, DCI);
  // NOTE(review): the case label for this return (intrinsic-without-chain
  // nodes) is elided in this view.
    return performIntrinsicWOChainCombine(N, DCI);
  case AMDGPUISD::FMAD_FTZ: {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue N2 = N->getOperand(2);
    EVT VT = N->getValueType(0);

    // FMAD_FTZ is a FMAD + flush denormals to zero.
    // We flush the inputs, the intermediate step, and the output.
    // NOTE(review): the three dyn_cast<ConstantFPSDNode> lines defining
    // N0CFP/N1CFP/N2CFP are elided in this view.
    if (N0CFP && N1CFP && N2CFP) {
      const auto FTZ = [](const APFloat &V) {
        if (V.isDenormal()) {
          APFloat Zero(V.getSemantics(), 0);
          return V.isNegative() ? -Zero : Zero;
        }
        return V;
      };

      APFloat V0 = FTZ(N0CFP->getValueAPF());
      APFloat V1 = FTZ(N1CFP->getValueAPF());
      APFloat V2 = FTZ(N2CFP->getValueAPF());
      // NOTE(review): the multiply step (V0 * V1) line is elided in this
      // view.
      V0 = FTZ(V0);
      // NOTE(review): the add step (V0 + V2) line is elided in this view.
      return DAG.getConstantFP(FTZ(V0), DL, VT);
    }
    break;
  }
  }
  return SDValue();
}
5751
5752//===----------------------------------------------------------------------===//
5753// Helper functions
5754//===----------------------------------------------------------------------===//
5755
// AMDGPUTargetLowering::CreateLiveInRegister — return an SDValue for the
// physical register \p Reg as a function live-in, registering the live-in
// and its virtual register on first use. With \p RawReg the raw register
// node is returned instead of a CopyFromReg off the entry chain.
// NOTE(review): the first signature line (taking the SelectionDAG) is
// elided from this view (extraction artifact) — verify against upstream.
                                           const TargetRegisterClass *RC,
                                           Register Reg, EVT VT,
                                           const SDLoc &SL,
                                           bool RawReg) const {
  // NOTE(review): the MachineFunction / MachineRegisterInfo (MRI)
  // declaration lines are elided in this view.
  Register VReg;

  if (!MRI.isLiveIn(Reg)) {
    VReg = MRI.createVirtualRegister(RC);
    MRI.addLiveIn(Reg, VReg);
  } else {
    VReg = MRI.getLiveInVirtReg(Reg);
  }

  if (RawReg)
    return DAG.getRegister(VReg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}
5777
// This may be called multiple times, and nothing prevents creating multiple
// objects at the same offset. See if we already defined this object.
// Returns the frame index of the fixed stack object at \p Offset, creating an
// immutable one if none exists; asserts that a reused object's size matches.
// NOTE(review): the extraction dropped the definition's opening line
// (per the index: static int getOrCreateFixedStackObject(MachineFrameInfo &,
// unsigned Size, int64_t Offset)) -- verify against upstream.
                                       int64_t Offset) {
  // Fixed objects occupy the negative frame-index range.
  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
    if (MFI.getObjectOffset(I) == Offset) {
      assert(MFI.getObjectSize(I) == Size);
      return I;
    }
  }

  // No existing object at this offset: create a new immutable fixed object.
  return MFI.CreateFixedObject(Size, Offset, true);
}
5791
// Tail of AMDGPUTargetLowering::loadStackInputValue: loads a value of type
// \p VT from a fixed stack object at \p Offset (creating or reusing the
// object), chained to the DAG entry node.
// NOTE(review): the extraction dropped the definition's opening line, the
// statement binding MF (presumably DAG.getMachineFunction()), and the
// trailing MachineMemOperand flag arguments of getLoad -- verify against
// upstream.
                                              EVT VT,
                                              const SDLoc &SL,
                                              int64_t Offset) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);

  auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);

  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
}
5807
// Tail of AMDGPUTargetLowering::storeStackInputValue: stores \p ArgVal at
// \p Offset bytes from the stack pointer and returns the store's chain.
// NOTE(review): the extraction dropped the definition's opening line and the
// statements binding Info and DstInfo used below -- verify against upstream.
                                                 const SDLoc &SL,
                                                 SDValue Chain,
                                                 SDValue ArgVal,
                                                 int64_t Offset) const {

  SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
  // Stores to the argument stack area are relative to the stack pointer.
  SDValue SP =
      DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
  Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
  SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
  return Store;
}
5826
// Tail of AMDGPUTargetLowering::loadInputValue: materializes an argument
// described by \p Arg, either from a live-in register or from the stack, and
// applies the argument's mask (shift right to the mask's low bit, then AND)
// when the descriptor is masked.
// NOTE(review): the extraction dropped the definition's opening line --
// verify against upstream.
                                          const TargetRegisterClass *RC,
                                          EVT VT, const SDLoc &SL,
                                          const ArgDescriptor &Arg) const {
  assert(Arg && "Attempting to load missing argument");

  // Register-resident arguments become live-in copies; stack-resident ones
  // are loaded from their fixed stack slot.
  SDValue V = Arg.isRegister() ?
    CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
    loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());

  if (!Arg.isMasked())
    return V;

  // Shift the field down to bit 0, then mask off the field width.
  unsigned Mask = Arg.getMask();
  unsigned Shift = llvm::countr_zero<unsigned>(Mask);
  V = DAG.getNode(ISD::SRL, SL, VT, V,
                  DAG.getShiftAmountConstant(Shift, VT, SL));
  return DAG.getNode(ISD::AND, SL, VT, V,
                     DAG.getConstant(Mask >> Shift, SL, VT));
}
5847
// Tail of AMDGPUTargetLowering::getImplicitParameterOffset: computes the byte
// offset of the requested implicit kernel parameter, starting from the
// explicit kernel argument area rounded up to the implicit-arg alignment.
// NOTE(review): the extraction dropped the definition's opening line --
// verify against upstream.
    uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
  unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
  const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
  // Implicit arguments start after the (aligned) explicit argument block.
  uint64_t ArgOffset =
      alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
  switch (Param) {
  case FIRST_IMPLICIT:
    return ArgOffset;
  case PRIVATE_BASE:
    // NOTE(review): the extraction dropped this case's return statement
    // (presumably ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET) --
    // verify against upstream.
  case SHARED_BASE:
    return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
  case QUEUE_PTR:
    return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
  }
  llvm_unreachable("unexpected implicit parameter type");
}
5866
5872
// Tail of AMDGPUTargetLowering::getSqrtEstimate: for f32, emits the hardware
// reciprocal-square-root node with zero refinement steps; other types are not
// handled and fall back to the default expansion.
// NOTE(review): the extraction dropped the definition's opening line (which
// declares Operand) -- verify against upstream.
                                                  SelectionDAG &DAG, int Enabled,
                                                  int &RefinementSteps,
                                                  bool &UseOneConstNR,
                                                  bool Reciprocal) const {
  EVT VT = Operand.getValueType();

  if (VT == MVT::f32) {
    // The hardware RSQ is used directly; no Newton-Raphson refinement.
    RefinementSteps = 0;
    return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
  }

  // TODO: There is also f64 rsq instruction, but the documentation is less
  // clear on its precision.

  return SDValue();
}
5890
// Tail of AMDGPUTargetLowering::getRecipEstimate: for f32, emits the hardware
// reciprocal node with zero refinement steps; other types fall back to the
// default expansion.
// NOTE(review): the extraction dropped the definition's opening line (which
// declares Operand) -- verify against upstream.
                                                   SelectionDAG &DAG, int Enabled,
                                                   int &RefinementSteps) const {
  EVT VT = Operand.getValueType();

  if (VT == MVT::f32) {
    // Reciprocal, < 1 ulp error.
    //
    // This reciprocal approximation converges to < 0.5 ulp error with one
    // newton rhapson performed with two fused multiple adds (FMAs).

    RefinementSteps = 0;
    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
  }

  // TODO: There is also f64 rcp instruction, but the documentation is less
  // clear on its precision.

  return SDValue();
}
5911
5912static unsigned workitemIntrinsicDim(unsigned ID) {
5913 switch (ID) {
5914 case Intrinsic::amdgcn_workitem_id_x:
5915 return 0;
5916 case Intrinsic::amdgcn_workitem_id_y:
5917 return 1;
5918 case Intrinsic::amdgcn_workitem_id_z:
5919 return 2;
5920 default:
5921 llvm_unreachable("not a workitem intrinsic");
5922 }
5923}
5924
// Tail of AMDGPUTargetLowering::computeKnownBitsForTargetNode: reports
// known-zero/known-one bits for AMDGPU-specific DAG nodes.
// NOTE(review): the extraction dropped the definition's opening line --
// verify against upstream.
    const SDValue Op, KnownBits &Known,
    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {

  Known.resetAll(); // Don't know anything.

  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  default:
    break;
  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW: {
    // Carry/borrow results are 0 or 1: all but the low bit are known zero.
    Known.Zero = APInt::getHighBitsSet(32, 31);
    break;
  }

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // Only a constant field width lets us say anything.
    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!CWidth)
      return;

    uint32_t Width = CWidth->getZExtValue() & 0x1f;

    // Unsigned extract: bits above the field width are known zero.
    if (Opc == AMDGPUISD::BFE_U32)
      Known.Zero = APInt::getHighBitsSet(32, 32 - Width);

    break;
  }
  case AMDGPUISD::FP_TO_FP16: {
    unsigned BitWidth = Known.getBitWidth();

    // High bits are zero.
    // NOTE(review): the extraction dropped the statement that records the
    // known-zero high bits here -- verify against upstream.
    break;
  }
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    // Trailing zeros of the factors add up in the product.
    unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
                      RHSKnown.countMinTrailingZeros();
    Known.Zero.setLowBits(std::min(TrailZ, 32u));
    // Skip extra check if all bits are known zeros.
    if (TrailZ >= 32)
      break;

    // Truncate to 24 bits.
    LHSKnown = LHSKnown.trunc(24);
    RHSKnown = RHSKnown.trunc(24);

    if (Opc == AMDGPUISD::MUL_I24) {
      unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
      unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits > 32)
        break;
      unsigned SignBits = 32 - MaxValBits + 1;
      bool LHSNegative = LHSKnown.isNegative();
      bool LHSNonNegative = LHSKnown.isNonNegative();
      bool LHSPositive = LHSKnown.isStrictlyPositive();
      bool RHSNegative = RHSKnown.isNegative();
      bool RHSNonNegative = RHSKnown.isNonNegative();
      bool RHSPositive = RHSKnown.isStrictlyPositive();

      // Same-sign factors give a non-negative product (high bits zero);
      // strictly-opposite-sign factors give a negative product (high bits
      // one).
      if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
        Known.Zero.setHighBits(SignBits);
      else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
        Known.One.setHighBits(SignBits);
    } else {
      unsigned LHSValBits = LHSKnown.countMaxActiveBits();
      unsigned RHSValBits = RHSKnown.countMaxActiveBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits >= 32)
        break;
      Known.Zero.setBitsFrom(MaxValBits);
    }
    break;
  }
  case AMDGPUISD::PERM: {
    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!CMask)
      return;

    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    unsigned Sel = CMask->getZExtValue();

    // Each selector byte picks a result byte: 0-3 select a byte of the RHS,
    // 4-6 a byte of the LHS, 0x0c yields 0x00, and values above 0x0c yield
    // 0xff.
    for (unsigned I = 0; I < 32; I += 8) {
      unsigned SelBits = Sel & 0xff;
      if (SelBits < 4) {
        SelBits *= 8;
        Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits < 7) {
        SelBits = (SelBits & 3) * 8;
        Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits == 0x0c) {
        Known.Zero |= 0xFFull << I;
      } else if (SelBits > 0x0c) {
        Known.One |= 0xFFull << I;
      }
      Sel >>= 8;
    }
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_UBYTE: {
    // Zero-extended byte load: top 24 bits are zero.
    Known.Zero.setHighBits(24);
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_USHORT: {
    // Zero-extended short load: top 16 bits are zero.
    Known.Zero.setHighBits(16);
    break;
  }
  case AMDGPUISD::LDS: {
    auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
    Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());

    // High 16 bits are known zero; low bits follow the global's alignment.
    Known.Zero.setHighBits(16);
    Known.Zero.setLowBits(Log2(Alignment));
    break;
  }
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    // The result is one of the three operands, so bits known in all three
    // are known in the result. Query in reverse operand order with early
    // exits when nothing is known.
    KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  // NOTE(review): the extraction dropped a case label here; the body reads an
  // intrinsic ID from operand 0, so it is presumably
  // ISD::INTRINSIC_WO_CHAIN -- verify against upstream.
    unsigned IID = Op.getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::amdgcn_workitem_id_z: {
      // The workitem ID is bounded by the subtarget's maximum for its
      // dimension, so the high bits above that bound are zero.
      unsigned MaxValue = Subtarget->getMaxWorkitemID(
      // NOTE(review): the extraction dropped the remaining arguments of this
      // call -- verify against upstream.
      Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
      break;
    }
    default:
      break;
    }
  }
  }
}
6089
// Tail of AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode: reports a
// lower bound on the number of sign bits for AMDGPU-specific DAG nodes
// (1 means "nothing known").
// NOTE(review): the extraction dropped the definition's opening line --
// verify against upstream.
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  switch (Op.getOpcode()) {
  case AMDGPUISD::BFE_I32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!Width)
      return 1;

    // A signed extract of W bits has at least 32 - W + 1 sign bits.
    unsigned SignBits = 32 - (Width->getZExtValue() & 0x1f) + 1;
    if (!isNullConstant(Op.getOperand(1)))
      return SignBits;

    // TODO: Could probably figure something out with non-0 offsets.
    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    return std::max(SignBits, Op0SignBits);
  }

  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
  }

  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW:
    return 31;
  case AMDGPUISD::BUFFER_LOAD_BYTE:
    return 25;
  case AMDGPUISD::BUFFER_LOAD_SHORT:
    return 17;
  case AMDGPUISD::BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPUISD::BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPUISD::FP_TO_FP16:
    return 16;
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    // The result is one of the three operands, so it has at least as many
    // sign bits as the worst of them.
    unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
    if (Tmp2 == 1)
      return 1; // Early out.

    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
    if (Tmp1 == 1)
      return 1; // Early out.

    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    if (Tmp0 == 1)
      return 1; // Early out.

    return std::min({Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}
6150
// Tail of AMDGPUTargetLowering::computeNumSignBitsForTargetInstr: GlobalISel
// counterpart of ComputeNumSignBitsForTargetNode, keyed on generic AMDGPU
// machine opcodes.
// NOTE(review): the extraction dropped the definition's opening line --
// verify against upstream.
    GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
    const MachineRegisterInfo &MRI, unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  if (!MI)
    return 1;

  // TODO: Check range metadata on MMO.
  switch (MI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
    return 25;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
    return 17;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    // med3's result is one of its three sources: take the minimum of their
    // sign-bit counts, with early exits when a source gives no information.
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
    unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
    if (Tmp2 == 1)
      return 1;
    unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
    if (Tmp1 == 1)
      return 1;
    unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
    if (Tmp0 == 1)
      return 1;
    return std::min({Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}
6186
// Tail of AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode: BFE
// nodes never introduce undef/poison; everything else defers to the base
// class.
// NOTE(review): the extraction dropped the definition's opening line and the
// line starting the base-class call (presumably
// return TargetLowering::canCreateUndefOrPoisonForTargetNode(...)) -- verify
// against upstream.
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32:
    return false;
  }
      Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
}
6199
// Tail of AMDGPUTargetLowering::isKnownNeverNaNForTargetNode: returns true
// when the AMDGPU node's result is known never to be a NaN (or never a
// signaling NaN when \p SNaN is set).
// NOTE(review): the extraction dropped the definition's opening line --
// verify against upstream.
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
    unsigned Depth) const {
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    if (SNaN)
      return true;

    // TODO: Can check no nans on one of the operands for each one, but which
    // one?
    return false;
  }
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
  }
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMINIMUM3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMAD_FTZ: {
    // Never-NaN if all three operands are never-NaN.
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
  }
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_CLAMP: {
    if (SNaN)
      return true;

    // TODO: Need is known positive check.
    return false;
  }
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
    // TODO: Refine on operands.
    return SNaN;
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW: {
    // TODO: Need check for infinity
    return SNaN;
  }
  // NOTE(review): the extraction dropped a case label here; the body reads an
  // intrinsic ID from operand 0, so it is presumably
  // ISD::INTRINSIC_WO_CHAIN -- verify against upstream.
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cvt_off_f32_i4:
      return true;

    case Intrinsic::amdgcn_frexp_mant: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_cvt_pkrtz: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_tanh: {
      if (SNaN)
        return true;

      // TODO: Need is known positive check.
      return false;
    }
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;
    case Intrinsic::amdgcn_fma_legacy:
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}
6314
// Tail of AMDGPUTargetLowering::isReassocProfitable (GlobalISel overload):
// only treats reassociation as profitable when N0 has a single non-debug use.
// NOTE(review): the extraction dropped the definition's opening line --
// verify against upstream.
    Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}
unsigned const MachineRegisterInfo * MRI
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_READONLY
Definition Compiler.h:322
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue lowerFEXPF64(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1477
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1232
const fltSemantics & getSemantics() const
Definition APFloat.h:1520
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1250
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1209
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1149
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1406
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1400
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1157
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1403
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
iterator_range< arg_iterator > args()
Definition Function.h:892
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ STRICT_FP16_TO_FP
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:997
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:945
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
MaybeAlign getAlign(const CallInst &I, unsigned Index)
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:236
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1632
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:477
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:419
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:292
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:142
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:258
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:167
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:312
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:264
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:148
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:132
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition KnownBits.h:114
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition KnownBits.h:285
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...