1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
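// When this option is enabled, the constructor below calls
// addBypassSlowDiv(64, 32), which emits a runtime check so that 64-bit
// divides whose operands happen to fit in 32 bits can take the much cheaper
// 32-bit path instead of the full 64-bit expansion.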
39
40// Find a larger type to do a load / store of a vector with.
41static EVT getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
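// Illustrative examples (not from the original source): v2i8 (16 bits) maps
// to i16, v2i64 (128 bits) maps to v4i32, while an odd size such as v3i16
// (48 bits) is returned unchanged because it is neither <= 32 bits nor a
// multiple of 32 bits.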
51
52unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
53 return DAG.computeKnownBits(Op).countMaxActiveBits();
54}
55
56unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
57 // In order for this to be a signed 24-bit value, bit 23 must
58 // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
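// These helpers report how many low bits of Op are actually significant.
// Their callers (likely the 24-bit multiply matching elsewhere in the AMDGPU
// backend -- an assumption, since the call sites are not shown in this
// listing) use them to prove that an operand fits in the hardware's 24-bit
// mul/mad forms.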
61
62AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather than generating calls to memset, memcpy, or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
73 MaxGluedStoresPerMemcpy = 16;
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
77 setOperationAction(ISD::LOAD, MVT::f32, Promote);
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
80 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
83 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
86 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
89 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
92 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
95 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
98 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
101 setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
119 setOperationAction(ISD::LOAD, MVT::i64, Promote);
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
122 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
125 setOperationAction(ISD::LOAD, MVT::f64, Promote);
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
128 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
131 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
134 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
137 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
140 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
143 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
146 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
155 setOperationAction(ISD::LOAD, MVT::i128, Promote);
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
159 setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
162 setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
165 setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
168 setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
171 setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
172 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
173
174 setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
176
177 setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
179
180 setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
185 for (MVT VT : MVT::integer_valuetypes())
186 setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
187 Expand);
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(Op, VT, MVT::i1, Promote);
195 setLoadExtAction(Op, VT, MVT::i8, Legal);
196 setLoadExtAction(Op, VT, MVT::i16, Legal);
197 setLoadExtAction(Op, VT, MVT::i32, Expand);
198 }
199 }
200
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
205 Expand);
206
207 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
221
222 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
227 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
228
229 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
241
242 setOperationAction(ISD::STORE, MVT::f32, Promote);
243 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
244
245 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
246 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
247
248 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
249 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
250
251 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
252 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
253
254 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
255 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
256
257 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
258 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
259
260 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
261 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
262
263 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
264 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
265
266 setOperationAction(ISD::STORE, MVT::v9f32, Promote);
267 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
268
269 setOperationAction(ISD::STORE, MVT::v10f32, Promote);
270 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
271
272 setOperationAction(ISD::STORE, MVT::v11f32, Promote);
273 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
274
275 setOperationAction(ISD::STORE, MVT::v12f32, Promote);
276 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
277
278 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
279 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
280
281 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
282 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
283
284 setOperationAction(ISD::STORE, MVT::i64, Promote);
285 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
286
287 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
288 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
289
290 setOperationAction(ISD::STORE, MVT::f64, Promote);
291 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
292
293 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
294 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
295
296 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
297 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
298
299 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
300 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
301
302 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
303 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
304
305 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
306 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
307
308 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
309 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
310
311 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
312 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
313
314 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
315 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
316
317 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
318 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
319
320 setOperationAction(ISD::STORE, MVT::i128, Promote);
321 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
322
323 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
325 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
326 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
327
328 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
330 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
331 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
332
333 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
334 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
335 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
336 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
337 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
338 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
339 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
340 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
341 setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
342 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
343 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
344 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
345 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
346 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
347 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
348
349 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
350 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
351 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
352
353 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
354 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
355 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
356
357 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
358
359 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
360 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
361 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
362 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
363 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
364 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
365 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
366
367 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
368 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
369 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
370 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
371 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
372
373 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
374 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
375 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
376
377 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
378 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
379 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
380
381 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
382 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
383 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
384
385 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
386 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
387 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
388
389 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
390 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
391 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
392 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
393 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
395 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
396
397 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
398 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
399
401
402 // For R600, this is totally unsupported, just custom lower to produce an
403 // error.
405
406 // Library functions. These default to Expand, but we have instructions
407 // for them.
410 {MVT::f16, MVT::f32}, Legal);
412
414 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
416 {MVT::f16, MVT::f32, MVT::f64}, Expand);
417
420 Custom);
421
422 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
423
424 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
425
426 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
427 Expand);
428
429 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
430
431 if (Subtarget->has16BitInsts()) {
432 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
434 } else {
435 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
437 }
438
440 Custom);
441
442 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
443 if (Subtarget->has16BitInsts()) {
445 }
446
447 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
448 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
449 // default unless marked custom/legal.
451 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
452 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
453 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
454 MVT::v16f64},
455 Custom);
456
457 if (isTypeLegal(MVT::f16))
459 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
460 Custom);
461
462 // Expand to fneg + fadd.
464
466 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
467 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
468 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
469 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
470 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
471 Custom);
472
475 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
476 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
477 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
478 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
479 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
480 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
481 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
482 Custom);
483
485 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
486
487 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
488 for (MVT VT : ScalarIntVTs) {
489 // These should use [SU]DIVREM, so set them to expand
491 Expand);
492
493 // GPU does not have divrem function for signed or unsigned.
495
496 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
498
500
501 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
503 }
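 // The DIVREM nodes produced by the expansions above are handled later by
 // LowerUDIVREM/LowerSDIVREM (see the ISD::UDIVREM/ISD::SDIVREM cases in
 // LowerOperation further down in this file).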
504
505 // The hardware supports 32-bit FSHR, but not FSHL.
507
508 setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
509
511
515 MVT::i64, Custom);
517
519 Legal);
520
523 MVT::i64, Custom);
524
525 for (auto VT : {MVT::i8, MVT::i16})
527
528 static const MVT::SimpleValueType VectorIntTypes[] = {
529 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
530 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
531
532 for (MVT VT : VectorIntTypes) {
533 // Expand the following operations for the current type by default.
546 VT, Expand);
547 }
548
549 static const MVT::SimpleValueType FloatVectorTypes[] = {
550 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
551 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
552
553 for (MVT VT : FloatVectorTypes) {
566 VT, Expand);
567 }
568
569 // This causes using an unrolled select operation rather than expansion with
570 // bit operations. This is in general better, but the alternative using BFI
571 // instructions may be better if the select sources are SGPRs.
572 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
573 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
574
575 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
576 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
577
578 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
579 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
580
581 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
582 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
583
584 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
585 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
586
587 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
588 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
589
590 setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
591 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
592
593 setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
594 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
595
596 setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
597 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
598
599 setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
600 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
601
603 setJumpIsExpensive(true);
604
607
609
610 // We want to find all load dependencies for long chains of stores to enable
611 // merging into very wide vectors. The problem is with vectors with > 4
612 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
613 // vectors are a legal type, even though we have to split the loads
614 // usually. When we can more precisely specify load legality per address
615 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
616 // smarter so that they can figure out what to do in 2 iterations without all
617 // N > 4 stores on the same chain.
619
620 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
621 // about these during lowering.
622 MaxStoresPerMemcpy = 0xffffffff;
623 MaxStoresPerMemmove = 0xffffffff;
624 MaxStoresPerMemset = 0xffffffff;
625
626 // The expansion for 64-bit division is enormous.
627 if (AMDGPUBypassSlowDiv)
628 addBypassSlowDiv(64, 32);
629
640
644}
645
646bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
647 if (getTargetMachine().Options.NoSignedZerosFPMath)
648 return true;
649
650 const auto Flags = Op.getNode()->getFlags();
651 if (Flags.hasNoSignedZeros())
652 return true;
653
654 return false;
655}
656
657//===----------------------------------------------------------------------===//
658// Target Information
659//===----------------------------------------------------------------------===//
660
662static bool fnegFoldsIntoOpcode(unsigned Opc) {
663 switch (Opc) {
664 case ISD::FADD:
665 case ISD::FSUB:
666 case ISD::FMUL:
667 case ISD::FMA:
668 case ISD::FMAD:
669 case ISD::FMINNUM:
670 case ISD::FMAXNUM:
671 case ISD::FMINNUM_IEEE:
672 case ISD::FMAXNUM_IEEE:
673 case ISD::FMINIMUM:
674 case ISD::FMAXIMUM:
675 case ISD::FMINIMUMNUM:
676 case ISD::FMAXIMUMNUM:
677 case ISD::SELECT:
678 case ISD::FSIN:
679 case ISD::FTRUNC:
680 case ISD::FRINT:
681 case ISD::FNEARBYINT:
682 case ISD::FROUNDEVEN:
683 case ISD::FCANONICALIZE:
684 case AMDGPUISD::RCP:
685 case AMDGPUISD::RCP_LEGACY:
686 case AMDGPUISD::RCP_IFLAG:
687 case AMDGPUISD::SIN_HW:
688 case AMDGPUISD::FMUL_LEGACY:
689 case AMDGPUISD::FMIN_LEGACY:
690 case AMDGPUISD::FMAX_LEGACY:
691 case AMDGPUISD::FMED3:
692 // TODO: handle llvm.amdgcn.fma.legacy
693 return true;
694 case ISD::BITCAST:
695 llvm_unreachable("bitcast is special cased");
696 default:
697 return false;
698 }
699}
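// The opcodes above are those whose operands can absorb an fneg through the
// VALU neg source modifier, so a combine can rewrite fneg(op(...)) as
// op(fneg(...)) (or fold the negation away entirely) without extra
// instructions; see fnegFoldsIntoOp below and the fneg combines that use it.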
700
701static bool fnegFoldsIntoOp(const SDNode *N) {
702 unsigned Opc = N->getOpcode();
703 if (Opc == ISD::BITCAST) {
704 // TODO: Is there a benefit to checking the conditions performFNegCombine
705 // does? We don't for the other cases.
706 SDValue BCSrc = N->getOperand(0);
707 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
708 return BCSrc.getNumOperands() == 2 &&
709 BCSrc.getOperand(1).getValueSizeInBits() == 32;
710 }
711
712 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
713 }
714
715 return fnegFoldsIntoOpcode(Opc);
716}
717
718/// \returns true if the operation will definitely need to use a 64-bit
719/// encoding, and thus will use a VOP3 encoding regardless of the source
720/// modifiers.
722static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
723 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
724 VT == MVT::f64;
725}
726
727/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
728/// type for ISD::SELECT.
730static bool selectSupportsSourceMods(const SDNode *N) {
731 // TODO: Only applies if select will be vector
732 return N->getValueType(0) == MVT::f32;
733}
734
735// Most FP instructions support source modifiers, but this could be refined
736// slightly.
738static bool hasSourceMods(const SDNode *N) {
739 if (isa<MemSDNode>(N))
740 return false;
741
742 switch (N->getOpcode()) {
743 case ISD::CopyToReg:
744 case ISD::FDIV:
745 case ISD::FREM:
746 case ISD::INLINEASM:
747 case ISD::INLINEASM_BR:
748 case AMDGPUISD::DIV_SCALE:
749 return false;
750
751 // TODO: Should really be looking at the users of the bitcast. These are
752 // problematic because bitcasts are used to legalize all stores to integer
753 // types.
754 case ISD::BITCAST:
755 return false;
756 case ISD::INTRINSIC_WO_CHAIN: {
757 switch (N->getConstantOperandVal(0)) {
758 case Intrinsic::amdgcn_interp_p1:
759 case Intrinsic::amdgcn_interp_p2:
760 case Intrinsic::amdgcn_interp_mov:
761 case Intrinsic::amdgcn_interp_p1_f16:
762 case Intrinsic::amdgcn_interp_p2_f16:
763 return false;
764 default:
765 return true;
766 }
767 }
768 case ISD::SELECT:
769 return selectSupportsSourceMods(N);
770 default:
771 return true;
772 }
773}
774
775bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
776 unsigned CostThreshold) {
777 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
778 // it is truly free to use a source modifier in all cases. If there are
779 // multiple users, and each of them would require a VOP3 encoding, there will be
780 // a code size increase. Try to avoid increasing code size unless we know it
781 // will save on the instruction count.
782 unsigned NumMayIncreaseSize = 0;
783 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
784
785 assert(!N->use_empty());
786
787 // XXX - Should this limit number of uses to check?
788 for (const SDNode *U : N->users()) {
789 if (!hasSourceMods(U))
790 return false;
791
792 if (!opMustUseVOP3Encoding(U, VT)) {
793 if (++NumMayIncreaseSize > CostThreshold)
794 return false;
795 }
796 }
797
798 return true;
799}
800
801EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
802 ISD::NodeType ExtendKind) const {
803 assert(!VT.isVector() && "only scalar expected");
804
805 // Round to the next multiple of 32-bits.
806 unsigned Size = VT.getSizeInBits();
807 if (Size <= 32)
808 return MVT::i32;
809 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
810}
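// For example, an i8 or i16 return is widened to i32, i48 rounds up to i64,
// and i65 rounds up to i96 (three 32-bit words).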
811
813 return 32;
814}
815
817 return true;
818}
819
820// The backend supports 32 and 64 bit floating point immediates.
821// FIXME: Why are we reporting vectors of FP immediates as legal?
822bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
823 bool ForCodeSize) const {
824 EVT ScalarVT = VT.getScalarType();
825 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
826 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
827}
828
829// We don't want to shrink f64 / f32 constants.
830bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
831 EVT ScalarVT = VT.getScalarType();
832 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
833}
834
835bool AMDGPUTargetLowering::shouldReduceLoadWidth(
836 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
837 std::optional<unsigned> ByteOffset) const {
838 // TODO: This may be worth removing. Check regression tests for diffs.
839 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
840 return false;
841
842 unsigned NewSize = NewVT.getStoreSizeInBits();
843
844 // If we are reducing to a 32-bit load or a smaller multi-dword load,
845 // this is always better.
846 if (NewSize >= 32)
847 return true;
848
849 EVT OldVT = N->getValueType(0);
850 unsigned OldSize = OldVT.getStoreSizeInBits();
851
852 const MemSDNode *MN = cast<MemSDNode>(N);
853 unsigned AS = MN->getAddressSpace();
854 // Do not shrink an aligned scalar load to sub-dword.
855 // Scalar engine cannot do sub-dword loads.
856 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
857 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
861 MN->isInvariant())) &&
863 return false;
864
865 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
866 // extloads, so doing one requires using a buffer_load. In cases where we
867 // still couldn't use a scalar load, using the wider load shouldn't really
868 // hurt anything.
869
870 // If the old size already had to be an extload, there's no harm in continuing
871 // to reduce the width.
872 return (OldSize < 32);
873}
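// In practice this accepts, e.g., narrowing a 64-bit load to 32 bits
// (NewSize >= 32), but refuses to narrow an aligned, invariant scalar load
// below 32 bits so it can stay on the scalar memory pipeline.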
874
875bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
876 const SelectionDAG &DAG,
877 const MachineMemOperand &MMO) const {
878
879 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
880
881 if (LoadTy.getScalarType() == MVT::i32)
882 return false;
883
884 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
885 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
886
887 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
888 return false;
889
890 unsigned Fast = 0;
891 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
892 CastTy, MMO, &Fast) &&
893 Fast;
894}
895
896// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
897// profitable with the expansion for 64-bit since it's generally good to
898// speculate things.
899bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
900 return true;
901}
902
903bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
904 return true;
905}
906
907bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
908 switch (N->getOpcode()) {
909 case ISD::EntryToken:
910 case ISD::TokenFactor:
911 return true;
912 case ISD::INTRINSIC_WO_CHAIN: {
913 unsigned IntrID = N->getConstantOperandVal(0);
914 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
915 }
916 case ISD::INTRINSIC_W_CHAIN: {
917 unsigned IntrID = N->getConstantOperandVal(1);
918 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
919 }
920 case ISD::LOAD:
921 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
922 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
923 return true;
924 return false;
925 case AMDGPUISD::SETCC: // ballot-style instruction
926 return true;
927 }
928 return false;
929}
930
931SDValue AMDGPUTargetLowering::getNegatedExpression(
932 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
933 NegatibleCost &Cost, unsigned Depth) const {
934
935 switch (Op.getOpcode()) {
936 case ISD::FMA:
937 case ISD::FMAD: {
938 // Negating a fma is not free if it has users without source mods.
939 if (!allUsesHaveSourceMods(Op.getNode()))
940 return SDValue();
941 break;
942 }
943 case AMDGPUISD::RCP: {
944 SDValue Src = Op.getOperand(0);
945 EVT VT = Op.getValueType();
946 SDLoc SL(Op);
947
948 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
949 ForCodeSize, Cost, Depth + 1);
950 if (NegSrc)
951 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
952 return SDValue();
953 }
954 default:
955 break;
956 }
957
958 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
959 ForCodeSize, Cost, Depth);
960}
961
962//===---------------------------------------------------------------------===//
963// Target Properties
964//===---------------------------------------------------------------------===//
965
966bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
967 assert(VT.isFloatingPoint());
968
969 // Packed operations do not have a fabs modifier.
970 return VT == MVT::f32 || VT == MVT::f64 ||
971 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
972}
973
974bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
975 assert(VT.isFloatingPoint());
976 // Report this based on the end legalized type.
977 VT = VT.getScalarType();
978 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
979}
980
981bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
982 unsigned NumElem,
983 unsigned AS) const {
984 return true;
985}
986
988 // There are few operations which truly have vector input operands. Any vector
989 // operation is going to involve operations on each component, and a
990 // build_vector will be a copy per element, so it always makes sense to use a
991 // build_vector input in place of the extracted element to avoid a copy into a
992 // super register.
993 //
994 // We should probably only do this if all users are extracts only, but this
995 // should be the common case.
996 return true;
997}
998
999bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
1000 // Truncate is just accessing a subregister.
1001
1002 unsigned SrcSize = Source.getSizeInBits();
1003 unsigned DestSize = Dest.getSizeInBits();
1004
1005 return DestSize < SrcSize && DestSize % 32 == 0;
1006}
1007
1008bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
1009 // Truncate is just accessing a subregister.
1010
1011 unsigned SrcSize = Source->getScalarSizeInBits();
1012 unsigned DestSize = Dest->getScalarSizeInBits();
1013
1014 if (DestSize == 16 && Subtarget->has16BitInsts())
1015 return SrcSize >= 32;
1016
1017 return DestSize < SrcSize && DestSize % 32 == 0;
1018}
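// Examples: truncating i64 to i32 is free (it is just a use of the low
// 32-bit subregister), and truncating a 32-bit or wider value to i16 is
// treated as free when the subtarget has true 16-bit instructions.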
1019
1020bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
1021 unsigned SrcSize = Src->getScalarSizeInBits();
1022 unsigned DestSize = Dest->getScalarSizeInBits();
1023
1024 if (SrcSize == 16 && Subtarget->has16BitInsts())
1025 return DestSize >= 32;
1026
1027 return SrcSize == 32 && DestSize == 64;
1028}
1029
1030bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
1031 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1032 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1033 // this will enable reducing 64-bit operations to 32-bit, which is always
1034 // good.
1035
1036 if (Src == MVT::i16)
1037 return Dest == MVT::i32 || Dest == MVT::i64;
1038
1039 return Src == MVT::i32 && Dest == MVT::i64;
1040}
1041
1042bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
1043 EVT DestVT) const {
1044 switch (N->getOpcode()) {
1045 case ISD::ADD:
1046 case ISD::SUB:
1047 case ISD::SHL:
1048 case ISD::SRL:
1049 case ISD::SRA:
1050 case ISD::AND:
1051 case ISD::OR:
1052 case ISD::XOR:
1053 case ISD::MUL:
1054 case ISD::SETCC:
1055 case ISD::SELECT:
1056 case ISD::SMIN:
1057 case ISD::SMAX:
1058 case ISD::UMIN:
1059 case ISD::UMAX:
1060 if (Subtarget->has16BitInsts() &&
1061 (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
1062 // Don't narrow back down to i16 if promoted to i32 already.
1063 if (!N->isDivergent() && DestVT.isInteger() &&
1064 DestVT.getScalarSizeInBits() > 1 &&
1065 DestVT.getScalarSizeInBits() <= 16 &&
1066 SrcVT.getScalarSizeInBits() > 16) {
1067 return false;
1068 }
1069 }
1070 return true;
1071 default:
1072 break;
1073 }
1074
1075 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1076 // limited number of native 64-bit operations. Shrinking an operation to fit
1077 // in a single 32-bit register should always be helpful. As currently used,
1078 // this is much less general than the name suggests, and is only used in
1079 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1080 // not profitable, and may actually be harmful.
1081 if (isa<LoadSDNode>(N))
1082 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1083
1084 return true;
1085}
1086
1087bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
1088 const SDNode* N, CombineLevel Level) const {
1089 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1090 N->getOpcode() == ISD::SRL) &&
1091 "Expected shift op");
1092
1093 SDValue ShiftLHS = N->getOperand(0);
1094 if (!ShiftLHS->hasOneUse())
1095 return false;
1096
1097 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1098 !ShiftLHS.getOperand(0)->hasOneUse())
1099 return false;
1100
1101 // Always commute pre-type legalization and right shifts.
1102 // We're looking for shl(or(x,y),z) patterns.
1103 if (Level < CombineLevel::AfterLegalizeTypes ||
1104 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1105 return true;
1106
1107 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1108 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1109 (N->user_begin()->getOpcode() == ISD::SRA ||
1110 N->user_begin()->getOpcode() == ISD::SRL))
1111 return false;
1112
1113 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1114 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1115 if (LHS.getOpcode() != ISD::SHL)
1116 return false;
1117 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1118 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1119 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1120 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1121 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1122 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1123 };
1124 SDValue LHS = N->getOperand(0).getOperand(0);
1125 SDValue RHS = N->getOperand(0).getOperand(1);
1126 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1127}
1128
1129//===---------------------------------------------------------------------===//
1130// TargetLowering Callbacks
1131//===---------------------------------------------------------------------===//
1132
1134 bool IsVarArg) {
1135 switch (CC) {
1143 return CC_AMDGPU;
1146 return CC_AMDGPU_CS_CHAIN;
1147 case CallingConv::C:
1148 case CallingConv::Fast:
1149 case CallingConv::Cold:
1150 return CC_AMDGPU_Func;
1153 return CC_SI_Gfx;
1156 default:
1157 reportFatalUsageError("unsupported calling convention for call");
1158 }
1159}
1160
1162 bool IsVarArg) {
1163 switch (CC) {
1166 llvm_unreachable("kernels should not be handled here");
1176 return RetCC_SI_Shader;
1179 return RetCC_SI_Gfx;
1180 case CallingConv::C:
1181 case CallingConv::Fast:
1182 case CallingConv::Cold:
1183 return RetCC_AMDGPU_Func;
1184 default:
1185 reportFatalUsageError("unsupported calling convention");
1186 }
1187}
1188
1189/// The SelectionDAGBuilder will automatically promote function arguments
1190/// with illegal types. However, this does not work for the AMDGPU targets
1191/// since the function arguments are stored in memory as these illegal types.
1192/// In order to handle this properly we need to get the original types sizes
1193/// from the LLVM IR Function and fix up the ISD::InputArg values before
1194/// passing them to AnalyzeFormalArguments()
1195
1196/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1197/// input values across multiple registers. Each item in the Ins array
1198/// represents a single value that will be stored in registers. Ins[x].VT is
1199/// the value type of the value that will be stored in the register, so
1200/// whatever SDNode we lower the argument to needs to be this type.
1201///
1202/// In order to correctly lower the arguments we need to know the size of each
1203/// argument. Since Ins[x].VT gives us the size of the register that will
1204/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1205/// for the original function argument so that we can deduce the correct memory
1206/// type to use for Ins[x]. In most cases the correct memory type will be
1207/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1208/// we have a kernel argument of type v8i8, this argument will be split into
1209/// 8 parts and each part will be represented by its own item in the Ins array.
1210/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1211/// the argument before it was split. From this, we deduce that the memory type
1212/// for each individual part is i8. We pass the memory type as LocVT to the
1213/// calling convention analysis function and the register type (Ins[x].VT) as
1214/// the ValVT.
1215void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1216 CCState &State,
1217 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1218 const MachineFunction &MF = State.getMachineFunction();
1219 const Function &Fn = MF.getFunction();
1220 LLVMContext &Ctx = Fn.getContext();
1221 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1222 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1223 CallingConv::ID CC = Fn.getCallingConv();
1224
1225 Align MaxAlign = Align(1);
1226 uint64_t ExplicitArgOffset = 0;
1227 const DataLayout &DL = Fn.getDataLayout();
1228
1229 unsigned InIndex = 0;
1230
1231 for (const Argument &Arg : Fn.args()) {
1232 const bool IsByRef = Arg.hasByRefAttr();
1233 Type *BaseArgTy = Arg.getType();
1234 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1235 Align Alignment = DL.getValueOrABITypeAlignment(
1236 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1237 MaxAlign = std::max(Alignment, MaxAlign);
1238 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1239
1240 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1241 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1242
1243 // We're basically throwing away everything passed into us and starting over
1244 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1245 // to us as computed in Ins.
1246 //
1247 // We also need to figure out what type legalization is trying to do to get
1248 // the correct memory offsets.
1249
1250 SmallVector<EVT, 16> ValueVTs;
1251 SmallVector<uint64_t, 16> Offsets;
1252 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1253 &Offsets, ArgOffset);
1254
1255 for (unsigned Value = 0, NumValues = ValueVTs.size();
1256 Value != NumValues; ++Value) {
1257 uint64_t BasePartOffset = Offsets[Value];
1258
1259 EVT ArgVT = ValueVTs[Value];
1260 EVT MemVT = ArgVT;
1261 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1262 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1263
1264 if (NumRegs == 1) {
1265 // This argument is not split, so the IR type is the memory type.
1266 if (ArgVT.isExtended()) {
1267 // We have an extended type, like i24, so we should just use the
1268 // register type.
1269 MemVT = RegisterVT;
1270 } else {
1271 MemVT = ArgVT;
1272 }
1273 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1274 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1275 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1276 // We have a vector value which has been split into a vector with
1277 // the same scalar type, but fewer elements. This should handle
1278 // all the floating-point vector types.
1279 MemVT = RegisterVT;
1280 } else if (ArgVT.isVector() &&
1281 ArgVT.getVectorNumElements() == NumRegs) {
1282 // This arg has been split so that each element is stored in a separate
1283 // register.
1284 MemVT = ArgVT.getScalarType();
1285 } else if (ArgVT.isExtended()) {
1286 // We have an extended type, like i65.
1287 MemVT = RegisterVT;
1288 } else {
1289 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1290 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1291 if (RegisterVT.isInteger()) {
1292 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1293 } else if (RegisterVT.isVector()) {
1294 assert(!RegisterVT.getScalarType().isFloatingPoint());
1295 unsigned NumElements = RegisterVT.getVectorNumElements();
1296 assert(MemoryBits % NumElements == 0);
1297 // This vector type has been split into another vector type with
1298 // a different elements size.
1299 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1300 MemoryBits / NumElements);
1301 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1302 } else {
1303 llvm_unreachable("cannot deduce memory type.");
1304 }
1305 }
1306
1307 // Convert one element vectors to scalar.
1308 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1309 MemVT = MemVT.getScalarType();
1310
1311 // Round up vec3/vec5 argument.
1312 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1313 MemVT = MemVT.getPow2VectorType(State.getContext());
1314 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1315 MemVT = MemVT.getRoundIntegerType(State.getContext());
1316 }
1317
1318 unsigned PartOffset = 0;
1319 for (unsigned i = 0; i != NumRegs; ++i) {
1320 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1321 BasePartOffset + PartOffset,
1322 MemVT.getSimpleVT(),
1323 CCValAssign::Full));
1324 PartOffset += MemVT.getStoreSize();
1325 }
1326 }
1327 }
1328}
1329
1330SDValue AMDGPUTargetLowering::LowerReturn(
1331 SDValue Chain, CallingConv::ID CallConv,
1332 bool isVarArg,
1333 const SmallVectorImpl<ISD::OutputArg> &Outs,
1334 const SmallVectorImpl<SDValue> &OutVals,
1335 const SDLoc &DL, SelectionDAG &DAG) const {
1336 // FIXME: Fails for r600 tests
1337 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1338 // "wave terminate should not have return values");
1339 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1340}
1341
1342//===---------------------------------------------------------------------===//
1343// Target specific lowering
1344//===---------------------------------------------------------------------===//
1345
1346/// Selects the correct CCAssignFn for a given CallingConvention value.
1351
1356
1357SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1358 SelectionDAG &DAG,
1359 MachineFrameInfo &MFI,
1360 int ClobberedFI) const {
1361 SmallVector<SDValue, 8> ArgChains;
1362 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1363 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1364
1365 // Include the original chain at the beginning of the list. When this is
1366 // used by target LowerCall hooks, this helps legalize find the
1367 // CALLSEQ_BEGIN node.
1368 ArgChains.push_back(Chain);
1369
1370 // Add a chain value for each stack argument corresponding
1371 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1372 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1373 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1374 if (FI->getIndex() < 0) {
1375 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1376 int64_t InLastByte = InFirstByte;
1377 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1378
1379 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1380 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1381 ArgChains.push_back(SDValue(L, 1));
1382 }
1383 }
1384 }
1385 }
1386
1387 // Build a tokenfactor for all the chains.
1388 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1389}
1390
1391SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1392 SmallVectorImpl<SDValue> &InVals,
1393 StringRef Reason) const {
1394 SDValue Callee = CLI.Callee;
1395 SelectionDAG &DAG = CLI.DAG;
1396
1397 const Function &Fn = DAG.getMachineFunction().getFunction();
1398
1399 StringRef FuncName("<unknown>");
1400
1401 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1402 FuncName = G->getSymbol();
1403 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1404 FuncName = G->getGlobal()->getName();
1405
1406 DAG.getContext()->diagnose(
1407 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1408
1409 if (!CLI.IsTailCall) {
1410 for (ISD::InputArg &Arg : CLI.Ins)
1411 InVals.push_back(DAG.getPOISON(Arg.VT));
1412 }
1413
1414 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1415 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1416 return CLI.Chain;
1417
1418 SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
1419 return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
1420}
1421
1422SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1423 SmallVectorImpl<SDValue> &InVals) const {
1424 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1425}
1426
1427SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1428 SelectionDAG &DAG) const {
1429 const Function &Fn = DAG.getMachineFunction().getFunction();
1430
1431 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1432 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1433 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1434 return DAG.getMergeValues(Ops, SDLoc());
1435}
1436
1437SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1438 SelectionDAG &DAG) const {
1439 switch (Op.getOpcode()) {
1440 default:
1441 Op->print(errs(), &DAG);
1442 llvm_unreachable("Custom lowering code for this "
1443 "instruction is not implemented yet!");
1444 break;
1445 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1446 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1447 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1448 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1449 case ISD::SDIVREM:
1450 return LowerSDIVREM(Op, DAG);
1451 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1452 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1453 case ISD::FRINT: return LowerFRINT(Op, DAG);
1454 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1455 case ISD::FROUNDEVEN:
1456 return LowerFROUNDEVEN(Op, DAG);
1457 case ISD::FROUND: return LowerFROUND(Op, DAG);
1458 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1459 case ISD::FLOG2:
1460 return LowerFLOG2(Op, DAG);
1461 case ISD::FLOG:
1462 case ISD::FLOG10:
1463 return LowerFLOGCommon(Op, DAG);
1464 case ISD::FEXP:
1465 case ISD::FEXP10:
1466 return lowerFEXP(Op, DAG);
1467 case ISD::FEXP2:
1468 return lowerFEXP2(Op, DAG);
1469 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1470 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1471 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1472 case ISD::FP_TO_SINT:
1473 case ISD::FP_TO_UINT:
1474 return LowerFP_TO_INT(Op, DAG);
1475 case ISD::CTTZ:
1476 case ISD::CTTZ_ZERO_UNDEF:
1477 case ISD::CTLZ:
1478 case ISD::CTLZ_ZERO_UNDEF:
1479 return LowerCTLZ_CTTZ(Op, DAG);
1480 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1481 }
1482 return Op;
1483}
1484
1485void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1486 SmallVectorImpl<SDValue> &Results,
1487 SelectionDAG &DAG) const {
1488 switch (N->getOpcode()) {
1489 case ISD::SIGN_EXTEND_INREG:
1490 // Different parts of legalization seem to interpret which type of
1491 // sign_extend_inreg is the one to check for custom lowering. The extended
1492 // from type is what really matters, but some places check for custom
1493 // lowering of the result type. This results in trying to use
1494 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1495 // nothing here and let the illegal result integer be handled normally.
1496 return;
1497 case ISD::FLOG2:
1498 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1499 Results.push_back(Lowered);
1500 return;
1501 case ISD::FLOG:
1502 case ISD::FLOG10:
1503 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1504 Results.push_back(Lowered);
1505 return;
1506 case ISD::FEXP2:
1507 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1508 Results.push_back(Lowered);
1509 return;
1510 case ISD::FEXP:
1511 case ISD::FEXP10:
1512 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1513 Results.push_back(Lowered);
1514 return;
1515 case ISD::CTLZ:
1516 case ISD::CTLZ_ZERO_UNDEF:
1517 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1518 Results.push_back(Lowered);
1519 return;
1520 default:
1521 return;
1522 }
1523}
1524
1525SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
1526 SDValue Op,
1527 SelectionDAG &DAG) const {
1528
1529 const DataLayout &DL = DAG.getDataLayout();
1530 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1531 const GlobalValue *GV = G->getGlobal();
1532
1533 if (!MFI->isModuleEntryFunction()) {
1534 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1535 if (std::optional<uint32_t> Address =
1537 if (IsNamedBarrier) {
1538 unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
1539 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1540 }
1541 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1542 } else if (IsNamedBarrier) {
1543 llvm_unreachable("named barrier should have an assigned address");
1544 }
1545 }
1546
1547 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1548 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1549 if (!MFI->isModuleEntryFunction() &&
1550 GV->getName() != "llvm.amdgcn.module.lds" &&
1552 SDLoc DL(Op);
1553 const Function &Fn = DAG.getMachineFunction().getFunction();
1555 Fn, "local memory global used by non-kernel function",
1556 DL.getDebugLoc(), DS_Warning));
1557
1558 // We currently don't have a way to correctly allocate LDS objects that
1559 // aren't directly associated with a kernel. We do force inlining of
1560 // functions that use local objects. However, if these dead functions are
1561 // not eliminated, we don't want a compile time error. Just emit a warning
1562 // and a trap, since there should be no callable path here.
1563 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1564 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1565 Trap, DAG.getRoot());
1566 DAG.setRoot(OutputChain);
1567 return DAG.getPOISON(Op.getValueType());
1568 }
1569
1570 // XXX: What does the value of G->getOffset() mean?
1571 assert(G->getOffset() == 0 &&
1572 "Do not know what to do with an non-zero offset");
1573
1574 // TODO: We could emit code to handle the initialization somewhere.
1575 // We ignore the initializer for now and legalize it to allow selection.
1576 // The initializer will anyway get errored out during assembly emission.
1577 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1578 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1579 }
1580 return SDValue();
1581}
1582
1583SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1584 SelectionDAG &DAG) const {
1585 SmallVector<SDValue, 8> Args;
1586 SDLoc SL(Op);
1587
1588 EVT VT = Op.getValueType();
1589 if (VT.getVectorElementType().getSizeInBits() < 32) {
1590 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1591 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1592 unsigned NewNumElt = OpBitSize / 32;
1593 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1594 : EVT::getVectorVT(*DAG.getContext(),
1595 MVT::i32, NewNumElt);
1596 for (const SDUse &U : Op->ops()) {
1597 SDValue In = U.get();
1598 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1599 if (NewNumElt > 1)
1600 DAG.ExtractVectorElements(NewIn, Args);
1601 else
1602 Args.push_back(NewIn);
1603 }
1604
1605 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1606 NewNumElt * Op.getNumOperands());
1607 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1608 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1609 }
1610 }
1611
1612 for (const SDUse &U : Op->ops())
1613 DAG.ExtractVectorElements(U.get(), Args);
1614
1615 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1616}
1617
1618SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1619 SelectionDAG &DAG) const {
1620 SDLoc SL(Op);
1621 SmallVector<SDValue, 8> Args;
1622 unsigned Start = Op.getConstantOperandVal(1);
1623 EVT VT = Op.getValueType();
1624 EVT SrcVT = Op.getOperand(0).getValueType();
1625
1626 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1627 unsigned NumElt = VT.getVectorNumElements();
1628 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1629 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1630
1631 // Extract 32-bit registers at a time.
1632 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1633 EVT NewVT = NumElt == 2
1634 ? MVT::i32
1635 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1636 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1637
1638 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1639 if (NumElt == 2)
1640 Tmp = Args[0];
1641 else
1642 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1643
1644 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1645 }
1646
1647 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1648 VT.getVectorNumElements());
1649
1650 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1651}
1652
1653// TODO: Handle fabs too
1654static SDValue peekFNeg(SDValue Val) {
1655 if (Val.getOpcode() == ISD::FNEG)
1656 return Val.getOperand(0);
1657
1658 return Val;
1659}
1660
1661static SDValue peekFPSignOps(SDValue Val) {
1662 if (Val.getOpcode() == ISD::FNEG)
1663 Val = Val.getOperand(0);
1664 if (Val.getOpcode() == ISD::FABS)
1665 Val = Val.getOperand(0);
1666 if (Val.getOpcode() == ISD::FCOPYSIGN)
1667 Val = Val.getOperand(0);
1668 return Val;
1669}
1670
1671SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1672 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1673 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1674 SelectionDAG &DAG = DCI.DAG;
1675 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1676 switch (CCOpcode) {
1677 case ISD::SETOEQ:
1678 case ISD::SETONE:
1679 case ISD::SETUNE:
1680 case ISD::SETNE:
1681 case ISD::SETUEQ:
1682 case ISD::SETEQ:
1683 case ISD::SETFALSE:
1684 case ISD::SETFALSE2:
1685 case ISD::SETTRUE:
1686 case ISD::SETTRUE2:
1687 case ISD::SETUO:
1688 case ISD::SETO:
1689 break;
1690 case ISD::SETULE:
1691 case ISD::SETULT: {
1692 if (LHS == True)
1693 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1694 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1695 }
1696 case ISD::SETOLE:
1697 case ISD::SETOLT:
1698 case ISD::SETLE:
1699 case ISD::SETLT: {
1700 // Ordered. Assume ordered for undefined.
1701
1702 // Only do this after legalization to avoid interfering with other combines
1703 // which might occur.
1704 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1705 !DCI.isCalledByLegalizer())
1706 return SDValue();
1707
1708 // We need to permute the operands to get the correct NaN behavior. The
1709 // selected operand is the second one based on the failing compare with NaN,
1710 // so permute it based on the compare type the hardware uses.
1711 if (LHS == True)
1712 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1713 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1714 }
1715 case ISD::SETUGE:
1716 case ISD::SETUGT: {
1717 if (LHS == True)
1718 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1719 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1720 }
1721 case ISD::SETGT:
1722 case ISD::SETGE:
1723 case ISD::SETOGE:
1724 case ISD::SETOGT: {
1725 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1726 !DCI.isCalledByLegalizer())
1727 return SDValue();
1728
1729 if (LHS == True)
1730 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1731 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1732 }
1733 case ISD::SETCC_INVALID:
1734 llvm_unreachable("Invalid setcc condcode!");
1735 }
1736 return SDValue();
1737}
1738
1739/// Generate Min/Max node
1740SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1741 SDValue LHS, SDValue RHS,
1742 SDValue True, SDValue False,
1743 SDValue CC,
1744 DAGCombinerInfo &DCI) const {
1745 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1746 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1747
1748 SelectionDAG &DAG = DCI.DAG;
1749
1750 // If we can't directly match this, try to see if we can fold an fneg to
1751 // match.
1752
1753 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1754 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1755 SDValue NegTrue = peekFNeg(True);
1756
1757 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1758 // fmin/fmax.
1759 //
1760 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1761 // -> fneg (fmin_legacy lhs, K)
1762 //
1763 // TODO: Use getNegatedExpression
1764 if (LHS == NegTrue && CFalse && CRHS) {
1765 APFloat NegRHS = neg(CRHS->getValueAPF());
1766 if (NegRHS == CFalse->getValueAPF()) {
1767 SDValue Combined =
1768 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1769 if (Combined)
1770 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1771 return SDValue();
1772 }
1773 }
1774
1775 return SDValue();
1776}
1777
1778std::pair<SDValue, SDValue>
1779AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1780 SDLoc SL(Op);
1781
1782 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1783
1784 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1785 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1786
1787 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1788 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1789
1790 return std::pair(Lo, Hi);
1791}
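// split64BitValue bitcasts a 64-bit value to v2i32 and returns the
// (low half, high half) pair; the getLoHalf64/getHiHalf64 helpers below do
// the same when only one half is needed. This is the basic building block
// for expanding 64-bit operations into 32-bit pieces.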
1792
1793SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1794 SDLoc SL(Op);
1795
1796 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1797 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1798 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1799}
1800
1801SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1802 SDLoc SL(Op);
1803
1804 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1805 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1806 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1807}
1808
1809// Split a vector type into two parts. The first part is a power of two vector.
1810// The second part is whatever is left over, and is a scalar if it would
1811// otherwise be a 1-vector.
1812std::pair<EVT, EVT>
1813AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1814 EVT LoVT, HiVT;
1815 EVT EltVT = VT.getVectorElementType();
1816 unsigned NumElts = VT.getVectorNumElements();
1817 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1818 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1819 HiVT = NumElts - LoNumElts == 1
1820 ? EltVT
1821 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1822 return std::pair(LoVT, HiVT);
1823}
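// Illustrative splits: v3i32 -> (v2i32, i32), v7f32 -> (v4f32, v3f32),
// v8i16 -> (v4i16, v4i16); the low part is always a power-of-two vector and
// a leftover single element is returned as a scalar.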
1824
1825// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1826// scalar.
1827std::pair<SDValue, SDValue>
1829 const EVT &LoVT, const EVT &HiVT,
1830 SelectionDAG &DAG) const {
1831 EVT VT = N.getValueType();
1832 assert(LoVT.getVectorNumElements() +
1833 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1834 VT.getVectorNumElements() &&
1835 "More vector elements requested than available!");
1836 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1837 DAG.getVectorIdxConstant(0, DL));
1838
1839 unsigned LoNumElts = LoVT.getVectorNumElements();
1840
1841 if (HiVT.isVector()) {
1842 unsigned HiNumElts = HiVT.getVectorNumElements();
1843 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1844 // Avoid creating an extract_subvector with an index that isn't a multiple
1845 // of the result type.
1846 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
1847 DAG.getConstant(LoNumElts, DL, MVT::i32));
1848 return {Lo, Hi};
1849 }
1850
1851 SmallVector<SDValue, 8> Elts;
1852 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1853 /*Count=*/HiNumElts);
1854 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1855 return {Lo, Hi};
1856 }
1857
1858 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, HiVT, N,
1859 DAG.getVectorIdxConstant(LoNumElts, DL));
1860 return {Lo, Hi};
1861}
1862
1863SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1864 SelectionDAG &DAG) const {
1865 LoadSDNode *Load = cast<LoadSDNode>(Op);
1866 EVT VT = Op.getValueType();
1867 SDLoc SL(Op);
1868
1869
1870 // If this is a 2 element vector, we really want to scalarize and not create
1871 // weird 1 element vectors.
1872 if (VT.getVectorNumElements() == 2) {
1873 SDValue Ops[2];
1874 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1875 return DAG.getMergeValues(Ops, SL);
1876 }
1877
1878 SDValue BasePtr = Load->getBasePtr();
1879 EVT MemVT = Load->getMemoryVT();
1880
1881 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1882
1883 EVT LoVT, HiVT;
1884 EVT LoMemVT, HiMemVT;
1885 SDValue Lo, Hi;
1886
1887 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1888 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1889 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1890
1891 unsigned Size = LoMemVT.getStoreSize();
1892 Align BaseAlign = Load->getAlign();
1893 Align HiAlign = commonAlignment(BaseAlign, Size);
1894
1895 SDValue LoLoad = DAG.getExtLoad(
1896 Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
1897 LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
1898 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1899 SDValue HiLoad = DAG.getExtLoad(
1900 Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
1901 SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
1902 Load->getMemOperand()->getFlags(), Load->getAAInfo());
1903
1904 SDValue Join;
1905 if (LoVT == HiVT) {
1906 // This is the case where the vector length is a power of two, so it was split evenly.
1907 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1908 } else {
1909 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1910 DAG.getVectorIdxConstant(0, SL));
1911 Join = DAG.getNode(
1912 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1913 VT, Join, HiLoad,
1914 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1915 }
1916
1917 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1918 LoLoad.getValue(1), HiLoad.getValue(1))};
1919
1920 return DAG.getMergeValues(Ops, SL);
1921}
1922
1923SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1924 SelectionDAG &DAG) const {
1925 LoadSDNode *Load = cast<LoadSDNode>(Op);
1926 EVT VT = Op.getValueType();
1927 SDValue BasePtr = Load->getBasePtr();
1928 EVT MemVT = Load->getMemoryVT();
1929 SDLoc SL(Op);
1930 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1931 Align BaseAlign = Load->getAlign();
1932 unsigned NumElements = MemVT.getVectorNumElements();
1933
1934 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1935 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1936 if (NumElements != 3 ||
1937 (BaseAlign < Align(8) &&
1938 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1939 return SplitVectorLoad(Op, DAG);
1940
1941 assert(NumElements == 3);
1942
1943 EVT WideVT =
1944 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1945 EVT WideMemVT =
1946 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1947 SDValue WideLoad = DAG.getExtLoad(
1948 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1949 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1950 return DAG.getMergeValues(
1951 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1952 DAG.getVectorIdxConstant(0, SL)),
1953 WideLoad.getValue(1)},
1954 SL);
1955}
1956
1957SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1958 SelectionDAG &DAG) const {
1959 StoreSDNode *Store = cast<StoreSDNode>(Op);
1960 SDValue Val = Store->getValue();
1961 EVT VT = Val.getValueType();
1962
1963 // If this is a 2 element vector, we really want to scalarize and not create
1964 // weird 1 element vectors.
1965 if (VT.getVectorNumElements() == 2)
1966 return scalarizeVectorStore(Store, DAG);
1967
1968 EVT MemVT = Store->getMemoryVT();
1969 SDValue Chain = Store->getChain();
1970 SDValue BasePtr = Store->getBasePtr();
1971 SDLoc SL(Op);
1972
1973 EVT LoVT, HiVT;
1974 EVT LoMemVT, HiMemVT;
1975 SDValue Lo, Hi;
1976
1977 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1978 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1979 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1980
1981 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1982
1983 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1984 Align BaseAlign = Store->getAlign();
1985 unsigned Size = LoMemVT.getStoreSize();
1986 Align HiAlign = commonAlignment(BaseAlign, Size);
1987
1988 SDValue LoStore =
1989 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1990 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1991 SDValue HiStore = DAG.getTruncStore(
1992 Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
1993 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1994
1995 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1996}
1997
1998// This is a shortcut for integer division because we have fast i32<->f32
1999// conversions, and fast f32 reciprocal instructions. The fractional part of a
2000// float is enough to accurately represent up to a 24-bit signed integer.
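// For example (rough sketch): with a = 1000 and b = 7, both values fit well
// within f32's 24-bit significand, so fq = trunc(1000.0f * rcp(7.0f)) gives
// 142.0 and iq = 142; fabs(fr) = 6 < fabs(fb) = 7, so no +1 correction is
// applied and the remainder is recomputed as 1000 - 142 * 7 = 6. Requiring at
// least 9 sign bits on each operand bounds DivBits to 32 - 9 = 23 (24 in the
// signed case), which the f32 significand can represent exactly.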
2001SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
2002 bool Sign) const {
2003 SDLoc DL(Op);
2004 EVT VT = Op.getValueType();
2005 SDValue LHS = Op.getOperand(0);
2006 SDValue RHS = Op.getOperand(1);
2007 MVT IntVT = MVT::i32;
2008 MVT FltVT = MVT::f32;
2009
2010 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
2011 if (LHSSignBits < 9)
2012 return SDValue();
2013
2014 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2015 if (RHSSignBits < 9)
2016 return SDValue();
2017
2018 unsigned BitSize = VT.getSizeInBits();
2019 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2020 unsigned DivBits = BitSize - SignBits;
2021 if (Sign)
2022 ++DivBits;
2023
2024 unsigned ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2025 unsigned ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
2026
2027 SDValue jq = DAG.getConstant(1, DL, IntVT);
2028
2029 if (Sign) {
2030 // char|short jq = ia ^ ib;
2031 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2032
2033 // jq = jq >> (bitsize - 2)
2034 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2035 DAG.getConstant(BitSize - 2, DL, VT));
2036
2037 // jq = jq | 0x1
2038 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2039 }
2040
2041 // int ia = (int)LHS;
2042 SDValue ia = LHS;
2043
2044 // int ib = (int)RHS;
2045 SDValue ib = RHS;
2046
2047 // float fa = (float)ia;
2048 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2049
2050 // float fb = (float)ib;
2051 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2052
2053 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2054 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2055
2056 // fq = trunc(fq);
2057 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2058
2059 // float fqneg = -fq;
2060 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2061
2062 MachineFunction &MF = DAG.getMachineFunction();
2063
2064 bool UseFmadFtz = false;
2065 if (Subtarget->isGCN()) {
2066 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2067 UseFmadFtz =
2068 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
2069 }
2070
2071 // float fr = mad(fqneg, fb, fa);
2072 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2073 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2074 : (unsigned)ISD::FMAD;
2075 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2076
2077 // int iq = (int)fq;
2078 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2079
2080 // fr = fabs(fr);
2081 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2082
2083 // fb = fabs(fb);
2084 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2085
2086 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2087
2088 // int cv = fr >= fb;
2089 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2090
2091 // jq = (cv ? jq : 0);
2092 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2093
2094 // dst = iq + jq;
2095 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2096
2097 // Rem needs compensation; it's easier to recompute it.
2098 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2099 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2100
2101 // Truncate to the number of bits this divide really is.
2102 if (Sign) {
2103 SDValue InRegSize
2104 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2105 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2106 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2107 } else {
2108 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2109 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2110 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2111 }
2112
2113 return DAG.getMergeValues({ Div, Rem }, DL);
2114}
2115
2116void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2117 SelectionDAG &DAG,
2118 SmallVectorImpl<SDValue> &Results) const {
2119 SDLoc DL(Op);
2120 EVT VT = Op.getValueType();
2121
2122 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2123
2124 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2125
2126 SDValue One = DAG.getConstant(1, DL, HalfVT);
2127 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2128
2129 //HiLo split
2130 SDValue LHS_Lo, LHS_Hi;
2131 SDValue LHS = Op.getOperand(0);
2132 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2133
2134 SDValue RHS_Lo, RHS_Hi;
2135 SDValue RHS = Op.getOperand(1);
2136 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2137
2138 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2139 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2140
2141 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2142 LHS_Lo, RHS_Lo);
2143
2144 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2145 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2146
2147 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2148 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2149 return;
2150 }
2151
2152 if (isTypeLegal(MVT::i64)) {
2153 // The algorithm here is based on ideas from "Software Integer Division",
2154 // Tom Rodeheffer, August 2008.
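// For reference, the f32 bit patterns used below are (approximately):
//   0x4f800000 = 2^32, 0x5f7ffffc ~= 2^64, 0x2f800000 = 2^-32,
//   0xcf800000 = -2^32.
// Roughly, Rcp64 ~= 2^64 / RHS, assembled from a high and a low 32-bit
// estimate, and the two UADDO_CARRY rounds that follow are Newton-Raphson
// refinements of that 64-bit reciprocal.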
2155
2156 MachineFunction &MF = DAG.getMachineFunction();
2157 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2158
2159 // Compute denominator reciprocal.
2160 unsigned FMAD =
2161 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2162 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2163 ? (unsigned)ISD::FMAD
2164 : (unsigned)AMDGPUISD::FMAD_FTZ;
2165
2166 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2167 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2168 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2169 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2170 Cvt_Lo);
2171 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2172 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2173 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2174 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2175 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2176 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2177 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2178 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2179 Mul1);
2180 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2181 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2182 SDValue Rcp64 = DAG.getBitcast(VT,
2183 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2184
2185 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2186 SDValue One64 = DAG.getConstant(1, DL, VT);
2187 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2188 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2189
2190 // First round of UNR (Unsigned integer Newton-Raphson).
2191 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2192 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2193 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2194 SDValue Mulhi1_Lo, Mulhi1_Hi;
2195 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2196 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2197 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2198 Mulhi1_Lo, Zero1);
2199 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2200 Mulhi1_Hi, Add1_Lo.getValue(1));
2201 SDValue Add1 = DAG.getBitcast(VT,
2202 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2203
2204 // Second round of UNR.
2205 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2206 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2207 SDValue Mulhi2_Lo, Mulhi2_Hi;
2208 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2209 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2210 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2211 Mulhi2_Lo, Zero1);
2212 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2213 Mulhi2_Hi, Add2_Lo.getValue(1));
2214 SDValue Add2 = DAG.getBitcast(VT,
2215 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2216
2217 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2218
2219 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2220
2221 SDValue Mul3_Lo, Mul3_Hi;
2222 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2223 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2224 Mul3_Lo, Zero1);
2225 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2226 Mul3_Hi, Sub1_Lo.getValue(1));
2227 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2228 SDValue Sub1 = DAG.getBitcast(VT,
2229 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2230
2231 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2232 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2233 ISD::SETUGE);
2234 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2235 ISD::SETUGE);
2236 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2237
2238 // TODO: Here and below portions of the code can be enclosed into if/endif.
2239 // Currently control flow is unconditional and we have 4 selects after
2240 // potential endif to substitute PHIs.
2241
2242 // if C3 != 0 ...
2243 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2244 RHS_Lo, Zero1);
2245 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2246 RHS_Hi, Sub1_Lo.getValue(1));
2247 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2248 Zero, Sub2_Lo.getValue(1));
2249 SDValue Sub2 = DAG.getBitcast(VT,
2250 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2251
2252 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2253
2254 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2255 ISD::SETUGE);
2256 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2257 ISD::SETUGE);
2258 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2259
2260 // if (C6 != 0)
2261 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2262
2263 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2264 RHS_Lo, Zero1);
2265 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2266 RHS_Hi, Sub2_Lo.getValue(1));
2267 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2268 Zero, Sub3_Lo.getValue(1));
2269 SDValue Sub3 = DAG.getBitcast(VT,
2270 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2271
2272 // endif C6
2273 // endif C3
2274
2275 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2276 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2277
2278 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2279 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2280
2281 Results.push_back(Div);
2282 Results.push_back(Rem);
2283
2284 return;
2285 }
2286
2287 // r600 expansion.
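// The loop below is a plain restoring long division over the remaining
// halfBitWidth bits: shift the running remainder left by one, bring in the
// next bit of LHS_Lo, and subtract RHS (setting the corresponding quotient
// bit) whenever the remainder is at least as large as the divisor.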
2288 // Get Speculative values
2289 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2290 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2291
2292 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2293 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2294 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2295
2296 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2297 SDValue DIV_Lo = Zero;
2298
2299 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2300
2301 for (unsigned i = 0; i < halfBitWidth; ++i) {
2302 const unsigned bitPos = halfBitWidth - i - 1;
2303 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2304 // Get value of high bit
2305 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2306 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2307 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2308
2309 // Shift
2310 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2311 // Add LHS high bit
2312 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2313
2314 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2315 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2316
2317 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2318
2319 // Update REM
2320 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2321 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2322 }
2323
2324 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2325 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2326 Results.push_back(DIV);
2327 Results.push_back(REM);
2328}
2329
2330SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2331 SelectionDAG &DAG) const {
2332 SDLoc DL(Op);
2333 EVT VT = Op.getValueType();
2334
2335 if (VT == MVT::i64) {
2336 SmallVector<SDValue, 2> Results;
2337 LowerUDIVREM64(Op, DAG, Results);
2338 return DAG.getMergeValues(Results, DL);
2339 }
2340
2341 if (VT == MVT::i32) {
2342 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2343 return Res;
2344 }
2345
2346 SDValue X = Op.getOperand(0);
2347 SDValue Y = Op.getOperand(1);
2348
2349 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2350 // algorithm used here.
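// Roughly, in C-like pseudocode (illustrative only):
//   z = urecip(y);               // initial estimate of 2^32 / y
//   z += mulhi(z, z * -y);       // one Newton-Raphson refinement
//   q = mulhi(x, z);             // quotient estimate (may be low by up to 2)
//   r = x - q * y;
//   if (r >= y) { ++q; r -= y; } // first fix-up
//   if (r >= y) { ++q; r -= y; } // second fix-up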
2351
2352 // Initial estimate of inv(y).
2353 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2354
2355 // One round of UNR.
2356 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2357 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2358 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2359 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2360
2361 // Quotient/remainder estimate.
2362 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2363 SDValue R =
2364 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2365
2366 // First quotient/remainder refinement.
2367 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2368 SDValue One = DAG.getConstant(1, DL, VT);
2369 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2370 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2371 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2372 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2373 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2374
2375 // Second quotient/remainder refinement.
2376 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2377 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2378 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2379 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2380 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2381
2382 return DAG.getMergeValues({Q, R}, DL);
2383}
2384
2385SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2386 SelectionDAG &DAG) const {
2387 SDLoc DL(Op);
2388 EVT VT = Op.getValueType();
2389
2390 SDValue LHS = Op.getOperand(0);
2391 SDValue RHS = Op.getOperand(1);
2392
2393 SDValue Zero = DAG.getConstant(0, DL, VT);
2394 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2395
2396 if (VT == MVT::i32) {
2397 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2398 return Res;
2399 }
2400
2401 if (VT == MVT::i64 &&
2402 DAG.ComputeNumSignBits(LHS) > 32 &&
2403 DAG.ComputeNumSignBits(RHS) > 32) {
2404 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2405
2406 //HiLo split
2407 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2408 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2409 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2410 LHS_Lo, RHS_Lo);
2411 SDValue Res[2] = {
2412 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2413 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2414 };
2415 return DAG.getMergeValues(Res, DL);
2416 }
2417
2418 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2419 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2420 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2421 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2422
2423 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2424 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2425
2426 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2427 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2428
2429 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2430 SDValue Rem = Div.getValue(1);
2431
2432 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2433 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2434
2435 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2436 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2437
2438 SDValue Res[2] = {
2439 Div,
2440 Rem
2441 };
2442 return DAG.getMergeValues(Res, DL);
2443}
2444
2445SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2446 SDLoc SL(Op);
2447 SDValue Src = Op.getOperand(0);
2448
2449 // result = trunc(src)
2450 // if (src > 0.0 && src != result)
2451 // result += 1.0
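// e.g. ceil(2.25): trunc = 2.0, 2.25 > 0.0 and 2.25 != 2.0, so the result is
// 2.0 + 1.0 = 3.0; for -2.25 the select picks 0.0 and trunc is returned.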
2452
2453 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2454
2455 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2456 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2457
2458 EVT SetCCVT =
2459 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2460
2461 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2462 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2463 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2464
2465 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2466 // TODO: Should this propagate fast-math-flags?
2467 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2468}
2469
2470static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2471 SelectionDAG &DAG) {
2472 const unsigned FractBits = 52;
2473 const unsigned ExpBits = 11;
2474
2475 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2476 Hi,
2477 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2478 DAG.getConstant(ExpBits, SL, MVT::i32));
2479 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2480 DAG.getConstant(1023, SL, MVT::i32));
2481
2482 return Exp;
2483}
2484
2485SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2486 SDLoc SL(Op);
2487 SDValue Src = Op.getOperand(0);
2488
2489 assert(Op.getValueType() == MVT::f64);
2490
2491 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2492
2493 // Extract the upper half, since this is where we will find the sign and
2494 // exponent.
2495 SDValue Hi = getHiHalf64(Src, DAG);
2496
2497 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2498
2499 const unsigned FractBits = 52;
2500
2501 // Extract the sign bit.
2502 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2503 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2504
2505 // Extend back to 64-bits.
2506 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2507 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2508
2509 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2510 const SDValue FractMask
2511 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2512
2513 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2514 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2515 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2516
2517 EVT SetCCVT =
2518 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2519
2520 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2521
2522 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2523 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2524
2525 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2526 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2527
2528 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2529}
2530
2531SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2532 SelectionDAG &DAG) const {
2533 SDLoc SL(Op);
2534 SDValue Src = Op.getOperand(0);
2535
2536 assert(Op.getValueType() == MVT::f64);
2537
2538 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2539 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2540 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2541
2542 // TODO: Should this propagate fast-math-flags?
2543
2544 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2545 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2546
2547 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2548
2549 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2550 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2551
2552 EVT SetCCVT =
2553 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2554 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2555
2556 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2557}
2558
2559SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2560 SelectionDAG &DAG) const {
2561 // FNEARBYINT and FRINT are the same, except in their handling of FP
2562 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2563 // rint, so just treat them as equivalent.
2564 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2565 Op.getOperand(0));
2566}
2567
2568SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2569 auto VT = Op.getValueType();
2570 auto Arg = Op.getOperand(0u);
2571 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2572}
2573
2574// XXX - May require not supporting f32 denormals?
2575
2576// Don't handle v2f16. The extra instructions to scalarize and repack around the
2577// compare and vselect end up producing worse code than scalarizing the whole
2578// operation.
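// The expansion below computes round-half-away-from-zero as
//   trunc(x) + copysign(|x - trunc(x)| >= 0.5 ? 1.0 : 0.0, x)
// e.g. round(2.5) = 2.0 + 1.0 = 3.0 and round(-2.5) = -2.0 + (-1.0) = -3.0.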
2579SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2580 SDLoc SL(Op);
2581 SDValue X = Op.getOperand(0);
2582 EVT VT = Op.getValueType();
2583
2584 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2585
2586 // TODO: Should this propagate fast-math-flags?
2587
2588 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2589
2590 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2591
2592 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2593 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2594
2595 EVT SetCCVT =
2596 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2597
2598 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2599 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2600 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2601
2602 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2603 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2604}
2605
2606SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2607 SDLoc SL(Op);
2608 SDValue Src = Op.getOperand(0);
2609
2610 // result = trunc(src);
2611 // if (src < 0.0 && src != result)
2612 // result += -1.0.
2613
2614 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2615
2616 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2617 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2618
2619 EVT SetCCVT =
2620 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2621
2622 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2623 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2624 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2625
2626 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2627 // TODO: Should this propagate fast-math-flags?
2628 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2629}
2630
2631/// Return true if it's known that \p Src can never be an f32 denormal value.
2632static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2633 switch (Src.getOpcode()) {
2634 case ISD::FP_EXTEND:
2635 return Src.getOperand(0).getValueType() == MVT::f16;
2636 case ISD::FP16_TO_FP:
2637 case ISD::FFREXP:
2638 case ISD::FSQRT:
2639 case AMDGPUISD::LOG:
2640 case AMDGPUISD::EXP:
2641 return true;
2642 case ISD::INTRINSIC_WO_CHAIN: {
2643 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2644 switch (IntrinsicID) {
2645 case Intrinsic::amdgcn_frexp_mant:
2646 case Intrinsic::amdgcn_log:
2647 case Intrinsic::amdgcn_log_clamp:
2648 case Intrinsic::amdgcn_exp2:
2649 case Intrinsic::amdgcn_sqrt:
2650 return true;
2651 default:
2652 return false;
2653 }
2654 }
2655 default:
2656 return false;
2657 }
2658
2659 llvm_unreachable("covered opcode switch");
2660}
2661
2662static bool allowApproxFunc(const SelectionDAG &DAG,
2663 SDNodeFlags Flags) {
2664 return Flags.hasApproximateFuncs();
2665}
2666
2675
2676SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2677 SDValue Src,
2678 SDNodeFlags Flags) const {
2679 SDLoc SL(Src);
2680 EVT VT = Src.getValueType();
2681 const fltSemantics &Semantics = VT.getFltSemantics();
2682 SDValue SmallestNormal =
2683 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2684
2685 // Want to scale denormals up, but negatives and 0 work just as well on the
2686 // scaled path.
2687 SDValue IsLtSmallestNormal = DAG.getSetCC(
2688 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2689 SmallestNormal, ISD::SETOLT);
2690
2691 return IsLtSmallestNormal;
2692}
2693
2694SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2695 SDNodeFlags Flags) const {
2696 SDLoc SL(Src);
2697 EVT VT = Src.getValueType();
2698 const fltSemantics &Semantics = VT.getFltSemantics();
2699 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2700
2701 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2702 SDValue IsFinite = DAG.getSetCC(
2703 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2704 Inf, ISD::SETOLT);
2705 return IsFinite;
2706}
2707
2708/// If denormal handling is required return the scaled input to FLOG2, and the
2709/// check for denormal range. Otherwise, return null values.
2710std::pair<SDValue, SDValue>
2711AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc &SL,
2712 SDValue Src, SDNodeFlags Flags) const {
2713 if (!needsDenormHandlingF32(DAG, Src, Flags))
2714 return {};
2715
2716 MVT VT = MVT::f32;
2717 const fltSemantics &Semantics = APFloat::IEEEsingle();
2718 SDValue SmallestNormal =
2719 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2720
2721 SDValue IsLtSmallestNormal = DAG.getSetCC(
2722 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2723 SmallestNormal, ISD::SETOLT);
2724
2725 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2726 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2727 SDValue ScaleFactor =
2728 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2729
2730 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2731 return {ScaledInput, IsLtSmallestNormal};
2732}
2733
2734SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2735 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2736 // If we have to handle denormals, scale up the input and adjust the result.
2737
2738 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2739 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
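// This relies on log2(x * 2^32) = log2(x) + 32: a denormal input is first
// scaled into the normal range, where v_log_f32 is accurate, and 32.0 is
// subtracted from the result to compensate.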
2740
2741 SDLoc SL(Op);
2742 EVT VT = Op.getValueType();
2743 SDValue Src = Op.getOperand(0);
2744 SDNodeFlags Flags = Op->getFlags();
2745
2746 if (VT == MVT::f16) {
2747 // Nothing in half is a denormal when promoted to f32.
2748 assert(!Subtarget->has16BitInsts());
2749 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2750 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2751 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2752 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2753 }
2754
2755 auto [ScaledInput, IsLtSmallestNormal] =
2756 getScaledLogInput(DAG, SL, Src, Flags);
2757 if (!ScaledInput)
2758 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2759
2760 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2761
2762 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2763 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2764 SDValue ResultOffset =
2765 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2766 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2767}
2768
2769static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2770 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2771 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2772 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2773}
2774
2775SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2776 SelectionDAG &DAG) const {
2777 SDValue X = Op.getOperand(0);
2778 EVT VT = Op.getValueType();
2779 SDNodeFlags Flags = Op->getFlags();
2780 SDLoc DL(Op);
2781 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2782 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2783
2784 const auto &Options = getTargetMachine().Options;
2785 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2786
2787 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2788 // Log and multiply in f32 is good enough for f16.
2789 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2790 }
2791
2792 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2793 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2794 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2795 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2796 }
2797
2798 return Lowered;
2799 }
2800
2801 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2802 if (ScaledInput)
2803 X = ScaledInput;
2804
2805 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2806
2807 SDValue R;
2808 if (Subtarget->hasFastFMAF32()) {
2809 // c+cc are ln(2)/ln(10) to more than 49 bits
2810 const float c_log10 = 0x1.344134p-2f;
2811 const float cc_log10 = 0x1.09f79ep-26f;
2812
2813 // c + cc is ln(2) to more than 49 bits
2814 const float c_log = 0x1.62e42ep-1f;
2815 const float cc_log = 0x1.efa39ep-25f;
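// The constant is split into a head (c) and a tail (cc) so that c + cc holds
// more bits of ln(2), or ln(2)/ln(10), than a single f32 can. The FMA chain
// below then computes, roughly, r = y*c + (fma(y, c, -y*c) + y*cc), i.e. the
// rounding error of the head product is recovered and folded back in along
// with the tail term.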
2816
2817 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2818 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2819 // This adds correction terms for which contraction may lead to an increase
2820 // in the error of the approximation, so disable it.
2821 Flags.setAllowContract(false);
2822 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2823 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2824 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2825 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2826 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2827 } else {
2828 // ch+ct is ln(2)/ln(10) to more than 36 bits
2829 const float ch_log10 = 0x1.344000p-2f;
2830 const float ct_log10 = 0x1.3509f6p-18f;
2831
2832 // ch + ct is ln(2) to more than 36 bits
2833 const float ch_log = 0x1.62e000p-1f;
2834 const float ct_log = 0x1.0bfbe8p-15f;
2835
2836 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2837 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2838
2839 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2840 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2841 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2842 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2843 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2844 // This adds correction terms for which contraction may lead to an increase
2845 // in the error of the approximation, so disable it.
2846 Flags.setAllowContract(false);
2847 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2848 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2849 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2850 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2851 }
2852
2853 const bool IsFiniteOnly =
2854 (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
2855
2856 // TODO: Check if known finite from source value.
2857 if (!IsFiniteOnly) {
2858 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2859 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2860 }
2861
2862 if (IsScaled) {
2863 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2864 SDValue ShiftK =
2865 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2866 SDValue Shift =
2867 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2868 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2869 }
2870
2871 return R;
2872}
2873
2877
2878// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2879// promoted f16 operation.
2880SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2881 SelectionDAG &DAG, bool IsLog10,
2882 SDNodeFlags Flags) const {
2883 EVT VT = Src.getValueType();
2884 unsigned LogOp =
2885 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2886
2887 double Log2BaseInverted =
2888 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2889
2890 if (VT == MVT::f32) {
2891 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2892 if (ScaledInput) {
2893 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2894 SDValue ScaledResultOffset =
2895 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2896
2897 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2898
2899 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2900 ScaledResultOffset, Zero, Flags);
2901
2902 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2903
2904 if (Subtarget->hasFastFMAF32())
2905 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2906 Flags);
2907 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2908 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2909 }
2910 }
2911
2912 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2913 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2914
2915 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2916 Flags);
2917}
2918
2919SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2920 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2921 // If we have to handle denormals, scale up the input and adjust the result.
2922
2923 SDLoc SL(Op);
2924 EVT VT = Op.getValueType();
2925 SDValue Src = Op.getOperand(0);
2926 SDNodeFlags Flags = Op->getFlags();
2927
2928 if (VT == MVT::f16) {
2929 // Nothing in half is a denormal when promoted to f32.
2930 assert(!Subtarget->has16BitInsts());
2931 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2932 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2933 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2934 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2935 }
2936
2937 assert(VT == MVT::f32);
2938
2939 if (!needsDenormHandlingF32(DAG, Src, Flags))
2940 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2941
2942 // bool needs_scaling = x < -0x1.f80000p+6f;
2943 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2944
2945 // -nextafter(128.0, -1)
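// This relies on exp2(x) = exp2(x + 64) * 2^-64: inputs below roughly -126
// would otherwise produce a denormal (inaccurate) result directly, so the
// input is biased up by 64 and the result scaled back down afterwards.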
2946 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2947
2948 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2949
2950 SDValue NeedsScaling =
2951 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2952
2953 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2954 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2955
2956 SDValue AddOffset =
2957 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2958
2959 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2960 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2961
2962 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2963 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2964 SDValue ResultScale =
2965 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2966
2967 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2968}
2969
2970SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
2971 SelectionDAG &DAG,
2972 SDNodeFlags Flags,
2973 bool IsExp10) const {
2974 // exp(x) -> exp2(M_LOG2E_F * x);
2975 // exp10(x) -> exp2(log2(10) * x);
2976 EVT VT = X.getValueType();
2977 SDValue Const =
2978 DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
2979
2980 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
2981 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2982 : (unsigned)ISD::FEXP2,
2983 SL, VT, Mul, Flags);
2984}
2985
2986SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2987 SelectionDAG &DAG,
2988 SDNodeFlags Flags) const {
2989 EVT VT = X.getValueType();
2990 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
2991 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
2992
2993 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2994
2995 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2996 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2997
2998 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2999
3000 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3001
3002 SDValue AdjustedX =
3003 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3004
3005 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
3006 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
3007
3008 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
3009
3010 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
3011 SDValue AdjustedResult =
3012 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
3013
3014 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3015 Flags);
3016}
3017
3018/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3019/// handled correctly.
3020SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
3021 SelectionDAG &DAG,
3022 SDNodeFlags Flags) const {
3023 const EVT VT = X.getValueType();
3024
3025 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3026 : static_cast<unsigned>(ISD::FEXP2);
3027
3028 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3029 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3030 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3031 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3032
3033 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3034 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3035 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3036 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3037 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3038 }
3039
3040 // bool s = x < -0x1.2f7030p+5f;
3041 // x += s ? 0x1.0p+5f : 0.0f;
3042 // exp10 = exp2(x * 0x1.a92000p+1f) *
3043 // exp2(x * 0x1.4f0978p-11f) *
3044 // (s ? 0x1.9f623ep-107f : 1.0f);
3045
3046 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3047
3048 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3049 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3050
3051 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3052 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3053 SDValue AdjustedX =
3054 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3055
3056 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3057 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3058
3059 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3060 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3061 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3062 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3063
3064 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3065
3066 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3067 SDValue AdjustedResult =
3068 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3069
3070 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3071 Flags);
3072}
3073
3074SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
3075 EVT VT = Op.getValueType();
3076 SDLoc SL(Op);
3077 SDValue X = Op.getOperand(0);
3078 SDNodeFlags Flags = Op->getFlags();
3079 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3080
3081 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3082 // library behavior. Also, is known-not-daz source sufficient?
3083 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3084 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3085 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3086 }
3087
3088 if (VT.getScalarType() == MVT::f16) {
3089 if (VT.isVector())
3090 return SDValue();
3091
3092 // Nothing in half is a denormal when promoted to f32.
3093 //
3094 // exp(f16 x) ->
3095 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3096 //
3097 // exp10(f16 x) ->
3098 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3099 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3100 SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
3101 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3102 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3103 }
3104
3105 assert(VT == MVT::f32);
3106
3107 // Algorithm:
3108 //
3109 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3110 //
3111 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3112 // n = 64*m + j, 0 <= j < 64
3113 //
3114 // e^x = 2^((64*m + j + f)/64)
3115 // = (2^m) * (2^(j/64)) * 2^(f/64)
3116 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3117 //
3118 // f = x*(64/ln(2)) - n
3119 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3120 //
3121 // e^x = (2^m) * (2^(j/64)) * e^r
3122 //
3123 // (2^(j/64)) is precomputed
3124 //
3125 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3126 // e^r = 1 + q
3127 //
3128 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3129 //
3130 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
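// In the expansion below the reduction is done without an explicit table:
// PH + PL approximates x * log2(e) (or x * log2(10)) in extended precision,
// E = roundeven(PH) plays the role of the integer part n, the hardware exp2
// handles the remaining fractional part, and FLDEXP applies the 2^E scaling.
// Underflow and overflow of the f32 result are then patched up with the
// explicit range checks on x.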
3131 SDNodeFlags FlagsNoContract = Flags;
3132 FlagsNoContract.setAllowContract(false);
3133
3134 SDValue PH, PL;
3135 if (Subtarget->hasFastFMAF32()) {
3136 const float c_exp = numbers::log2ef;
3137 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3138 const float c_exp10 = 0x1.a934f0p+1f;
3139 const float cc_exp10 = 0x1.2f346ep-24f;
3140
3141 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3142 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3143
3144 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3145 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3146 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3147 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3148 } else {
3149 const float ch_exp = 0x1.714000p+0f;
3150 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3151
3152 const float ch_exp10 = 0x1.a92000p+1f;
3153 const float cl_exp10 = 0x1.4f0978p-11f;
3154
3155 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3156 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3157
3158 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3159 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3160 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3161 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3162 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3163
3164 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3165
3166 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3167 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3168 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3169 }
3170
3171 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3172
3173 // It is unsafe to contract this fsub into the PH multiply.
3174 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3175
3176 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3177 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3178 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3179
3180 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3181
3182 SDValue UnderflowCheckConst =
3183 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3184
3185 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3186 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3187 SDValue Underflow =
3188 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3189
3190 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3191
3192 if (!Flags.hasNoInfs()) {
3193 SDValue OverflowCheckConst =
3194 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3195 SDValue Overflow =
3196 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3197 SDValue Inf =
3198 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3199 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3200 }
3201
3202 return R;
3203}
3204
3205static bool isCtlzOpc(unsigned Opc) {
3206 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3207}
3208
3209static bool isCttzOpc(unsigned Opc) {
3210 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3211}
3212
3213SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3214 SelectionDAG &DAG) const {
3215 auto SL = SDLoc(Op);
3216 auto Opc = Op.getOpcode();
3217 auto Arg = Op.getOperand(0u);
3218 auto ResultVT = Op.getValueType();
3219
3220 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3221 return {};
3222
3224 assert(ResultVT == Arg.getValueType());
3225
3226 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3227 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3228 SDValue NewOp;
3229
3230 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3231 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3232 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3233 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3234 } else {
3235 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3236 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3237 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3238 }
3239
3240 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3241}
3242
3244 SDLoc SL(Op);
3245 SDValue Src = Op.getOperand(0);
3246
3247 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3248 bool Ctlz = isCtlzOpc(Op.getOpcode());
3249 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3250
3251 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3252 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3253 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3254
3255 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3256 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3257 // (cttz hi:lo) -> (umin (ffbl src), 32)
3258 // (ctlz_zero_undef src) -> (ffbh src)
3259 // (cttz_zero_undef src) -> (ffbl src)
3260
3261 // The 64-bit scalar version produces a 32-bit result:
3262 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3263 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3264 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3265 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3266 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3267 if (!ZeroUndef) {
3268 const SDValue ConstVal = DAG.getConstant(
3269 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3270 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3271 }
3272 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3273 }
3274
3275 SDValue Lo, Hi;
3276 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3277
3278 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3279 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3280
3281 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3282 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3283 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3284 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
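// Worked example: for src = 0x000000000000ff00, ffbh(hi) = ffbh(0) = ~0,
// uaddsat(ffbh(lo), 32) = 16 + 32 = 48, and umin3(~0, 48, 64) = 48, the
// expected ctlz. The saturating add keeps the lo term from wrapping when lo
// is also zero (a plain add is only safe in the *_zero_undef forms).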
3285
3286 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3287 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3288 if (Ctlz)
3289 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3290 else
3291 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3292
3293 SDValue NewOpr;
3294 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3295 if (!ZeroUndef) {
3296 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3297 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3298 }
3299
3300 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3301}
3302
3303SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3304 bool Signed) const {
3305 // The regular method of converting a 64-bit integer to float roughly consists of
3306 // 2 steps: normalization and rounding. In fact, after normalization, the
3307 // conversion from a 64-bit integer to a float is essentially the same as the
3308 // one from a 32-bit integer. The only difference is that it has more
3309 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3310 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3311 // converted into the correct float number. The basic steps for the unsigned
3312 // conversion are illustrated in the following pseudo code:
3313 //
3314 // f32 uitofp(i64 u) {
3315 // i32 hi, lo = split(u);
3316 // // Only count the leading zeros in hi as we have native support of the
3317 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3318 // // reduced to a 32-bit one automatically.
3319 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3320 // u <<= shamt;
3321 // hi, lo = split(u);
3322 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3323 // // convert it as a 32-bit integer and scale the result back.
3324 // return uitofp(hi) * 2^(32 - shamt);
3325 // }
3326 //
3327 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3328 // sign bits instead. If 'ffbh_i32' is not available, the absolute value is
3329 // converted instead, followed by negation based on its sign bit.
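// Worked example (unsigned): u = 0x0000000180000001. clz(hi) = 31, so after
// u <<= 31 the halves are hi = 0xc0000000 and lo = 0x80000000; since lo != 0
// the low bit of hi is set as a sticky rounding bit, hi is converted to f32,
// and the result is scaled by 2^(32 - 31) via ldexp.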
3330
3331 SDLoc SL(Op);
3332 SDValue Src = Op.getOperand(0);
3333
3334 SDValue Lo, Hi;
3335 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3336 SDValue Sign;
3337 SDValue ShAmt;
3338 if (Signed && Subtarget->isGCN()) {
3339 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3340 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3341 // account. That is, the maximal shift is
3342 // - 32 if Lo and Hi have opposite signs;
3343 // - 33 if Lo and Hi have the same sign.
3344 //
3345 // Or, MaxShAmt = 33 + OppositeSign, where
3346 //
3347 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3348 // - -1 if Lo and Hi have opposite signs; and
3349 // - 0 otherwise.
3350 //
3351 // All in all, ShAmt is calculated as
3352 //
3353 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3354 //
3355 // or
3356 //
3357 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3358 //
3359 // to reduce the critical path.
3360 SDValue OppositeSign = DAG.getNode(
3361 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3362 DAG.getConstant(31, SL, MVT::i32));
3363 SDValue MaxShAmt =
3364 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3365 OppositeSign);
3366 // Count the leading sign bits.
3367 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3368 // Different from unsigned conversion, the shift should be one bit less to
3369 // preserve the sign bit.
3370 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3371 DAG.getConstant(1, SL, MVT::i32));
3372 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3373 } else {
3374 if (Signed) {
3375 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3376 // absolute value first.
3377 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3378 DAG.getConstant(63, SL, MVT::i64));
3379 SDValue Abs =
3380 DAG.getNode(ISD::XOR, SL, MVT::i64,
3381 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3382 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3383 }
3384 // Count the leading zeros.
3385 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3386 // The shift amount for signed integers is [0, 32].
3387 }
3388 // Normalize the given 64-bit integer.
3389 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3390 // Split it again.
3391 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3392 // Calculate the adjust bit for rounding.
3393 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3394 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3395 DAG.getConstant(1, SL, MVT::i32), Lo);
3396 // Get the 32-bit normalized integer.
3397 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3398 // Convert the normalized 32-bit integer into f32.
3399 unsigned Opc =
3400 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3401 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3402
3403 // Finally, scale the converted floating-point number back, as the original
3404 // 64-bit integer was converted as a 32-bit one.
3405 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3406 ShAmt);
3407 // On GCN, use LDEXP directly.
3408 if (Subtarget->isGCN())
3409 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3410
3411 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3412 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3413 // exponent is enough to avoid overflowing into the sign bit.
3414 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3415 DAG.getConstant(23, SL, MVT::i32));
3416 SDValue IVal =
3417 DAG.getNode(ISD::ADD, SL, MVT::i32,
3418 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3419 if (Signed) {
3420 // Set the sign bit.
3421 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3422 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3423 DAG.getConstant(31, SL, MVT::i32));
3424 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3425 }
3426 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3427}
3428
3429SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3430 bool Signed) const {
3431 SDLoc SL(Op);
3432 SDValue Src = Op.getOperand(0);
3433
3434 SDValue Lo, Hi;
3435 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3436
3437 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3438 SL, MVT::f64, Hi);
3439
3440 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3441
3442 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3443 DAG.getConstant(32, SL, MVT::i32));
3444 // TODO: Should this propagate fast-math-flags?
3445 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3446}
3447
3448SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3449 SelectionDAG &DAG) const {
3450 // TODO: Factor out code common with LowerSINT_TO_FP.
3451 EVT DestVT = Op.getValueType();
3452 SDValue Src = Op.getOperand(0);
3453 EVT SrcVT = Src.getValueType();
3454
3455 if (SrcVT == MVT::i16) {
3456 if (DestVT == MVT::f16)
3457 return Op;
3458 SDLoc DL(Op);
3459
3460 // Promote src to i32
3461 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3462 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3463 }
3464
3465 if (DestVT == MVT::bf16) {
3466 SDLoc SL(Op);
3467 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3468 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3469 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3470 }
3471
3472 if (SrcVT != MVT::i64)
3473 return Op;
3474
3475 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3476 SDLoc DL(Op);
3477
3478 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3479 SDValue FPRoundFlag =
3480 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3481 SDValue FPRound =
3482 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3483
3484 return FPRound;
3485 }
3486
3487 if (DestVT == MVT::f32)
3488 return LowerINT_TO_FP32(Op, DAG, false);
3489
3490 assert(DestVT == MVT::f64);
3491 return LowerINT_TO_FP64(Op, DAG, false);
3492}
3493
3494 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3495 SelectionDAG &DAG) const {
3496 EVT DestVT = Op.getValueType();
3497
3498 SDValue Src = Op.getOperand(0);
3499 EVT SrcVT = Src.getValueType();
3500
3501 if (SrcVT == MVT::i16) {
3502 if (DestVT == MVT::f16)
3503 return Op;
3504
3505 SDLoc DL(Op);
3506 // Promote src to i32
3507 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3508 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3509 }
3510
3511 if (DestVT == MVT::bf16) {
3512 SDLoc SL(Op);
3513 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3514 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3515 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3516 }
3517
3518 if (SrcVT != MVT::i64)
3519 return Op;
3520
3521 // TODO: Factor out code common with LowerUINT_TO_FP.
3522
3523 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3524 SDLoc DL(Op);
3525 SDValue Src = Op.getOperand(0);
3526
3527 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3528 SDValue FPRoundFlag =
3529 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3530 SDValue FPRound =
3531 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3532
3533 return FPRound;
3534 }
3535
3536 if (DestVT == MVT::f32)
3537 return LowerINT_TO_FP32(Op, DAG, true);
3538
3539 assert(DestVT == MVT::f64);
3540 return LowerINT_TO_FP64(Op, DAG, true);
3541}
3542
3543 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3544 bool Signed) const {
3545 SDLoc SL(Op);
3546
3547 SDValue Src = Op.getOperand(0);
3548 EVT SrcVT = Src.getValueType();
3549
3550 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3551
3552 // The basic idea of converting a floating point number into a pair of 32-bit
3553 // integers is illustrated as follows:
3554 //
3555 // tf := trunc(val);
3556 // hif := floor(tf * 2^-32);
3557 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3558 // hi := fptoi(hif);
3559 // lo := fptoi(lof);
3560 //
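// For example, val = 2^33 + 7: tf = 2^33 + 7, hif = 2.0, lof = 7.0, so
// hi = 2 and lo = 7, i.e. the i64 value 0x200000007.
//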
3561 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3562 SDValue Sign;
3563 if (Signed && SrcVT == MVT::f32) {
3564 // However, a 32-bit floating-point number has only a 23-bit mantissa, which
3565 // is not enough to hold all the significant bits of `lof` if val is
3566 // negative. To avoid the loss of precision, we need to take the absolute
3567 // value after truncating and flip the result back based on the original
3568 // sign.
3569 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3570 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3571 DAG.getConstant(31, SL, MVT::i32));
3572 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3573 }
3574
3575 SDValue K0, K1;
3576 if (SrcVT == MVT::f64) {
3577 K0 = DAG.getConstantFP(
3578 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3579 SrcVT);
3580 K1 = DAG.getConstantFP(
3581 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3582 SrcVT);
3583 } else {
3584 K0 = DAG.getConstantFP(
3585 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3586 K1 = DAG.getConstantFP(
3587 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3588 }
3589 // TODO: Should this propagate fast-math-flags?
3590 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3591
3592 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3593
3594 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3595
3596 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3597 : ISD::FP_TO_UINT,
3598 SL, MVT::i32, FloorMul);
3599 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3600
3601 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3602 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3603
3604 if (Signed && SrcVT == MVT::f32) {
3605 assert(Sign);
3606 // Flip the result based on the signedness, which is either all 0s or 1s.
3607 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3608 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3609 // r := xor(r, sign) - sign;
3610 Result =
3611 DAG.getNode(ISD::SUB, SL, MVT::i64,
3612 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3613 }
3614
3615 return Result;
3616}
3617
3618 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3619 SDLoc DL(Op);
3620 SDValue N0 = Op.getOperand(0);
3621
3622 // Convert to target node to get known bits
3623 if (N0.getValueType() == MVT::f32)
3624 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3625
3626 if (Op->getFlags().hasApproximateFuncs()) {
3627 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3628 return SDValue();
3629 }
3630
3631 return LowerF64ToF16Safe(N0, DL, DAG);
3632}
3633
3634// return node in i32
3635 SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3636 SelectionDAG &DAG) const {
3637 assert(Src.getSimpleValueType() == MVT::f64);
3638
3639 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3640 // TODO: We can generate better code for True16.
3641 const unsigned ExpMask = 0x7ff;
3642 const unsigned ExpBiasf64 = 1023;
3643 const unsigned ExpBiasf16 = 15;
3644 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3645 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3646 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3647 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3648 DAG.getConstant(32, DL, MVT::i64));
3649 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3650 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3651 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3652 DAG.getConstant(20, DL, MVT::i64));
3653 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3654 DAG.getConstant(ExpMask, DL, MVT::i32));
3655 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3656 // add the f16 bias (15) to get the biased exponent for the f16 format.
3657 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3658 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
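// For example, 1.0 has an f64 exponent field of 1023, which rebiases to 15, the
// f16 exponent field of 1.0. An Inf/NaN exponent field of 0x7ff rebiases to 1039,
// which is what the E == 1039 check below tests for.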
3659
3660 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3661 DAG.getConstant(8, DL, MVT::i32));
3662 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3663 DAG.getConstant(0xffe, DL, MVT::i32));
3664
3665 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3666 DAG.getConstant(0x1ff, DL, MVT::i32));
3667 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3668
3669 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3670 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3671
3672 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3673 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3674 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3675 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
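// I is the value used when the source is Inf or NaN (E == 1039 below): Inf maps
// to 0x7c00, and a NaN keeps a nonzero mantissa by OR-ing in 0x0200.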
3676
3677 // N = M | (E << 12);
3678 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3679 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3680 DAG.getConstant(12, DL, MVT::i32)));
3681
3682 // B = clamp(1-E, 0, 13);
3683 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3684 One, E);
3685 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3686 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3687 DAG.getConstant(13, DL, MVT::i32));
3688
3689 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3690 DAG.getConstant(0x1000, DL, MVT::i32));
3691
3692 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3693 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3694 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3695 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3696
3697 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3698 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3699 DAG.getConstant(0x7, DL, MVT::i32));
3700 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3701 DAG.getConstant(2, DL, MVT::i32));
3702 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3703 One, Zero, ISD::SETEQ);
3704 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3705 One, Zero, ISD::SETGT);
3706 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3707 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3708
3709 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3710 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3711 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3712 I, V, ISD::SETEQ);
3713
3714 // Extract the sign bit.
3715 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3716 DAG.getConstant(16, DL, MVT::i32));
3717 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3718 DAG.getConstant(0x8000, DL, MVT::i32));
3719
3720 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3721}
3722
3723 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3724 SelectionDAG &DAG) const {
3725 SDValue Src = Op.getOperand(0);
3726 unsigned OpOpcode = Op.getOpcode();
3727 EVT SrcVT = Src.getValueType();
3728 EVT DestVT = Op.getValueType();
3729
3730 // Will be selected natively
3731 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3732 return Op;
3733
3734 if (SrcVT == MVT::bf16) {
3735 SDLoc DL(Op);
3736 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3737 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3738 }
3739
3740 // Promote i16 to i32
3741 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3742 SDLoc DL(Op);
3743
3744 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3745 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3746 }
3747
3748 if (DestVT != MVT::i64)
3749 return Op;
3750
3751 if (SrcVT == MVT::f16 ||
3752 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3753 SDLoc DL(Op);
3754
3755 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3756 unsigned Ext =
3757 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3758 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3759 }
3760
3761 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3762 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3763
3764 return SDValue();
3765}
3766
3767 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3768 SelectionDAG &DAG) const {
3769 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3770 MVT VT = Op.getSimpleValueType();
3771 MVT ScalarVT = VT.getScalarType();
3772
3773 assert(VT.isVector());
3774
3775 SDValue Src = Op.getOperand(0);
3776 SDLoc DL(Op);
3777
3778 // TODO: Don't scalarize on Evergreen?
3779 unsigned NElts = VT.getVectorNumElements();
3780 SmallVector<SDValue, 8> Args;
3781 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3782
3783 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3784 for (unsigned I = 0; I < NElts; ++I)
3785 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3786
3787 return DAG.getBuildVector(VT, DL, Args);
3788}
3789
3790//===----------------------------------------------------------------------===//
3791// Custom DAG optimizations
3792//===----------------------------------------------------------------------===//
3793
3794static bool isU24(SDValue Op, SelectionDAG &DAG) {
3795 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3796}
3797
3798static bool isI24(SDValue Op, SelectionDAG &DAG) {
3799 EVT VT = Op.getValueType();
3800 return VT.getSizeInBits() >= 24 && // Types smaller than 24 bits should be
3801 // treated as unsigned 24-bit values.
3802 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3803 }
3804
3805 static SDValue simplifyMul24(SDNode *Node24,
3806 TargetLowering::DAGCombinerInfo &DCI) {
3807 SelectionDAG &DAG = DCI.DAG;
3808 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3809 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3810
3811 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3812 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3813 unsigned NewOpcode = Node24->getOpcode();
3814 if (IsIntrin) {
3815 unsigned IID = Node24->getConstantOperandVal(0);
3816 switch (IID) {
3817 case Intrinsic::amdgcn_mul_i24:
3818 NewOpcode = AMDGPUISD::MUL_I24;
3819 break;
3820 case Intrinsic::amdgcn_mul_u24:
3821 NewOpcode = AMDGPUISD::MUL_U24;
3822 break;
3823 case Intrinsic::amdgcn_mulhi_i24:
3824 NewOpcode = AMDGPUISD::MULHI_I24;
3825 break;
3826 case Intrinsic::amdgcn_mulhi_u24:
3827 NewOpcode = AMDGPUISD::MULHI_U24;
3828 break;
3829 default:
3830 llvm_unreachable("Expected 24-bit mul intrinsic");
3831 }
3832 }
3833
3834 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3835
3836 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3837 // the operands to have other uses, but will only perform simplifications that
3838 // involve bypassing some nodes for this user.
3839 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3840 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3841 if (DemandedLHS || DemandedRHS)
3842 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3843 DemandedLHS ? DemandedLHS : LHS,
3844 DemandedRHS ? DemandedRHS : RHS);
3845
3846 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3847 // operands if this node is the only user.
3848 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3849 return SDValue(Node24, 0);
3850 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3851 return SDValue(Node24, 0);
3852
3853 return SDValue();
3854}
3855
3856template <typename IntTy>
3857 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3858 uint32_t Width, const SDLoc &DL) {
3859 if (Width + Offset < 32) {
3860 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3861 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3862 if constexpr (std::is_signed_v<IntTy>) {
3863 return DAG.getSignedConstant(Result, DL, MVT::i32);
3864 } else {
3865 return DAG.getConstant(Result, DL, MVT::i32);
3866 }
3867 }
3868
3869 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3870}
3871
3872static bool hasVolatileUser(SDNode *Val) {
3873 for (SDNode *U : Val->users()) {
3874 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3875 if (M->isVolatile())
3876 return true;
3877 }
3878 }
3879
3880 return false;
3881}
3882
3883 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3884 // i32 vectors are the canonical memory type.
3885 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3886 return false;
3887
3888 if (!VT.isByteSized())
3889 return false;
3890
3891 unsigned Size = VT.getStoreSize();
3892
3893 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3894 return false;
3895
3896 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3897 return false;
3898
3899 return true;
3900}
3901
3902// Replace load of an illegal type with a bitcast from a load of a friendlier
3903// type.
3904 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3905 DAGCombinerInfo &DCI) const {
3906 if (!DCI.isBeforeLegalize())
3907 return SDValue();
3908
3909 LoadSDNode *LN = cast<LoadSDNode>(N);
3910 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3911 return SDValue();
3912
3913 SDLoc SL(N);
3914 SelectionDAG &DAG = DCI.DAG;
3915 EVT VT = LN->getMemoryVT();
3916
3917 unsigned Size = VT.getStoreSize();
3918 Align Alignment = LN->getAlign();
3919 if (Alignment < Size && isTypeLegal(VT)) {
3920 unsigned IsFast;
3921 unsigned AS = LN->getAddressSpace();
3922
3923 // Expand unaligned loads earlier than legalization. Due to visitation order
3924 // problems during legalization, the emitted instructions to pack and unpack
3925 // the bytes again are not eliminated in the case of an unaligned copy.
3926 if (!allowsMisalignedMemoryAccesses(
3927 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3928 if (VT.isVector())
3929 return SplitVectorLoad(SDValue(LN, 0), DAG);
3930
3931 SDValue Ops[2];
3932 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3933
3934 return DAG.getMergeValues(Ops, SDLoc(N));
3935 }
3936
3937 if (!IsFast)
3938 return SDValue();
3939 }
3940
3941 if (!shouldCombineMemoryType(VT))
3942 return SDValue();
3943
3944 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3945
3946 SDValue NewLoad
3947 = DAG.getLoad(NewVT, SL, LN->getChain(),
3948 LN->getBasePtr(), LN->getMemOperand());
3949
3950 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3951 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3952 return SDValue(N, 0);
3953}
3954
3955// Replace store of an illegal type with a store of a bitcast to a friendlier
3956// type.
3957 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3958 DAGCombinerInfo &DCI) const {
3959 if (!DCI.isBeforeLegalize())
3960 return SDValue();
3961
3962 StoreSDNode *SN = cast<StoreSDNode>(N);
3963 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3964 return SDValue();
3965
3966 EVT VT = SN->getMemoryVT();
3967 unsigned Size = VT.getStoreSize();
3968
3969 SDLoc SL(N);
3970 SelectionDAG &DAG = DCI.DAG;
3971 Align Alignment = SN->getAlign();
3972 if (Alignment < Size && isTypeLegal(VT)) {
3973 unsigned IsFast;
3974 unsigned AS = SN->getAddressSpace();
3975
3976 // Expand unaligned stores earlier than legalization. Due to visitation
3977 // order problems during legalization, the emitted instructions to pack and
3978 // unpack the bytes again are not eliminated in the case of an unaligned
3979 // copy.
3980 if (!allowsMisalignedMemoryAccesses(
3981 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3982 if (VT.isVector())
3983 return SplitVectorStore(SDValue(SN, 0), DAG);
3984
3985 return expandUnalignedStore(SN, DAG);
3986 }
3987
3988 if (!IsFast)
3989 return SDValue();
3990 }
3991
3992 if (!shouldCombineMemoryType(VT))
3993 return SDValue();
3994
3995 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3996 SDValue Val = SN->getValue();
3997
3998 //DCI.AddToWorklist(Val.getNode());
3999
4000 bool OtherUses = !Val.hasOneUse();
4001 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
4002 if (OtherUses) {
4003 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
4004 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
4005 }
4006
4007 return DAG.getStore(SN->getChain(), SL, CastVal,
4008 SN->getBasePtr(), SN->getMemOperand());
4009}
4010
4011// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4012// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4013// issues.
4014 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4015 DAGCombinerInfo &DCI) const {
4016 SelectionDAG &DAG = DCI.DAG;
4017 SDValue N0 = N->getOperand(0);
4018
4019 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4020 // (vt2 (truncate (assertzext vt0:x, vt1)))
4021 if (N0.getOpcode() == ISD::TRUNCATE) {
4022 SDValue N1 = N->getOperand(1);
4023 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4024 SDLoc SL(N);
4025
4026 SDValue Src = N0.getOperand(0);
4027 EVT SrcVT = Src.getValueType();
4028 if (SrcVT.bitsGE(ExtVT)) {
4029 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4030 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4031 }
4032 }
4033
4034 return SDValue();
4035}
4036
4037 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
4038 SDNode *N, DAGCombinerInfo &DCI) const {
4039 unsigned IID = N->getConstantOperandVal(0);
4040 switch (IID) {
4041 case Intrinsic::amdgcn_mul_i24:
4042 case Intrinsic::amdgcn_mul_u24:
4043 case Intrinsic::amdgcn_mulhi_i24:
4044 case Intrinsic::amdgcn_mulhi_u24:
4045 return simplifyMul24(N, DCI);
4046 case Intrinsic::amdgcn_fract:
4047 case Intrinsic::amdgcn_rsq:
4048 case Intrinsic::amdgcn_rcp_legacy:
4049 case Intrinsic::amdgcn_rsq_legacy:
4050 case Intrinsic::amdgcn_rsq_clamp:
4051 case Intrinsic::amdgcn_tanh:
4052 case Intrinsic::amdgcn_prng_b32: {
4053 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4054 SDValue Src = N->getOperand(1);
4055 return Src.isUndef() ? Src : SDValue();
4056 }
4057 case Intrinsic::amdgcn_frexp_exp: {
4058 // frexp_exp (fneg x) -> frexp_exp x
4059 // frexp_exp (fabs x) -> frexp_exp x
4060 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4061 SDValue Src = N->getOperand(1);
4062 SDValue PeekSign = peekFPSignOps(Src);
4063 if (PeekSign == Src)
4064 return SDValue();
4065 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4066 0);
4067 }
4068 default:
4069 return SDValue();
4070 }
4071}
4072
4073 /// Split the 64-bit value \p LHS into two 32-bit components, and apply the
4074 /// binary operation \p Opc to each half with the corresponding constant operand.
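/// For example, with \p Opc = ISD::AND, \p ValLo = 0 and \p ValHi = 0x0000ffff,
/// the low half is ANDed with 0 and the high half with 0x0000ffff before the
/// halves are recombined with a build_vector and a bitcast back to i64.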
4075 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4076 DAGCombinerInfo &DCI, const SDLoc &SL,
4077 unsigned Opc, SDValue LHS,
4078 uint32_t ValLo, uint32_t ValHi) const {
4079 SelectionDAG &DAG = DCI.DAG;
4080 SDValue Lo, Hi;
4081 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4082
4083 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4084 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4085
4086 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4087 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4088
4089 // Re-visit the ands. It's possible we eliminated one of them and it could
4090 // simplify the vector.
4091 DCI.AddToWorklist(Lo.getNode());
4092 DCI.AddToWorklist(Hi.getNode());
4093
4094 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4095 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4096}
4097
4098 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4099 DAGCombinerInfo &DCI) const {
4100 EVT VT = N->getValueType(0);
4101 SDValue LHS = N->getOperand(0);
4102 SDValue RHS = N->getOperand(1);
4103 ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
4104 SDLoc SL(N);
4105 SelectionDAG &DAG = DCI.DAG;
4106
4107 unsigned RHSVal;
4108 if (CRHS) {
4109 RHSVal = CRHS->getZExtValue();
4110 if (!RHSVal)
4111 return LHS;
4112
4113 switch (LHS->getOpcode()) {
4114 default:
4115 break;
4116 case ISD::ZERO_EXTEND:
4117 case ISD::SIGN_EXTEND:
4118 case ISD::ANY_EXTEND: {
4119 SDValue X = LHS->getOperand(0);
4120
4121 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4122 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4123 // Prefer build_vector as the canonical form if packed types are legal.
4124 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4125 SDValue Vec = DAG.getBuildVector(
4126 MVT::v2i16, SL,
4127 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4128 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4129 }
4130
4131 // shl (ext x) => zext (shl x), if shift does not overflow int
4132 if (VT != MVT::i64)
4133 break;
4134 KnownBits Known = DAG.computeKnownBits(X);
4135 unsigned LZ = Known.countMinLeadingZeros();
4136 if (LZ < RHSVal)
4137 break;
4138 EVT XVT = X.getValueType();
4139 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4140 return DAG.getZExtOrTrunc(Shl, SL, VT);
4141 }
4142 }
4143 }
4144
4145 if (VT.getScalarType() != MVT::i64)
4146 return SDValue();
4147
4148 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4149 // common case, splitting this into a move and a 32-bit shift is faster and
4150 // the same code size.
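// For shift amounts >= 32: i64 (shl x, C) -> (build_pair 0, (shl lo_32(x), C - 32)).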
4151 KnownBits Known = DAG.computeKnownBits(RHS);
4152
4153 EVT ElementType = VT.getScalarType();
4154 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4155 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4156
4157 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4158 return SDValue();
4159 SDValue ShiftAmt;
4160
4161 if (CRHS) {
4162 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4163 TargetType);
4164 } else {
4165 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4166 const SDValue ShiftMask =
4167 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4168 // This AND instruction will clamp out of bounds shift values.
4169 // It will also be removed during later instruction selection.
4170 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4171 }
4172
4173 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4174 SDValue NewShift =
4175 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4176
4177 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4178 SDValue Vec;
4179
4180 if (VT.isVector()) {
4181 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4182 unsigned NElts = TargetType.getVectorNumElements();
4183 SmallVector<SDValue, 8> HiOps;
4184 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4185
4186 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4187 for (unsigned I = 0; I != NElts; ++I)
4188 HiAndLoOps[2 * I + 1] = HiOps[I];
4189 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4190 } else {
4191 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4192 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4193 }
4194 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4195}
4196
4197 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4198 DAGCombinerInfo &DCI) const {
4199 SDValue RHS = N->getOperand(1);
4200 ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
4201 EVT VT = N->getValueType(0);
4202 SDValue LHS = N->getOperand(0);
4203 SelectionDAG &DAG = DCI.DAG;
4204 SDLoc SL(N);
4205
4206 if (VT.getScalarType() != MVT::i64)
4207 return SDValue();
4208
4209 // For C >= 32
4211 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))
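// e.g. (sra i64:x, 40) becomes (build_pair (sra hi_32(x), 8), (sra hi_32(x), 31)).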
4211
4212 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4213 // common case, splitting this into a move and a 32-bit shift is faster and
4214 // the same code size.
4215 KnownBits Known = DAG.computeKnownBits(RHS);
4216
4217 EVT ElementType = VT.getScalarType();
4218 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4219 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4220
4221 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4222 return SDValue();
4223
4224 SDValue ShiftFullAmt =
4225 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4226 SDValue ShiftAmt;
4227 if (CRHS) {
4228 unsigned RHSVal = CRHS->getZExtValue();
4229 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4230 TargetType);
4231 } else if (Known.getMinValue().getZExtValue() ==
4232 (ElementType.getSizeInBits() - 1)) {
4233 ShiftAmt = ShiftFullAmt;
4234 } else {
4235 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4236 const SDValue ShiftMask =
4237 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4238 // This AND instruction will clamp out of bounds shift values.
4239 // It will also be removed during later instruction selection.
4240 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4241 }
4242
4243 EVT ConcatType;
4244 SDValue Hi;
4245 SDLoc LHSSL(LHS);
4246 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4247 if (VT.isVector()) {
4248 unsigned NElts = TargetType.getVectorNumElements();
4249 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4250 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4251 SmallVector<SDValue, 8> HiOps(NElts);
4252 SmallVector<SDValue, 16> HiAndLoOps;
4253
4254 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4255 for (unsigned I = 0; I != NElts; ++I) {
4256 HiOps[I] = HiAndLoOps[2 * I + 1];
4257 }
4258 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4259 } else {
4260 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4261 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4262 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4263 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4264 }
4265
4266 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4267 SDValue HiShift;
4268 if (KnownLHS.isNegative()) {
4269 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4270 } else {
4271 Hi = DAG.getFreeze(Hi);
4272 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4273 }
4274 SDValue NewShift =
4275 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4276
4277 SDValue Vec;
4278 if (VT.isVector()) {
4279 unsigned NElts = TargetType.getVectorNumElements();
4280 SmallVector<SDValue, 8> HiOps;
4281 SmallVector<SDValue, 8> LoOps;
4282 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4283
4284 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4285 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4286 for (unsigned I = 0; I != NElts; ++I) {
4287 HiAndLoOps[2 * I + 1] = HiOps[I];
4288 HiAndLoOps[2 * I] = LoOps[I];
4289 }
4290 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4291 } else {
4292 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4293 }
4294 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4295}
4296
4297 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4298 DAGCombinerInfo &DCI) const {
4299 SDValue RHS = N->getOperand(1);
4300 ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
4301 EVT VT = N->getValueType(0);
4302 SDValue LHS = N->getOperand(0);
4303 SelectionDAG &DAG = DCI.DAG;
4304 SDLoc SL(N);
4305 unsigned RHSVal;
4306
4307 if (CRHS) {
4308 RHSVal = CRHS->getZExtValue();
4309
4310 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4311 // this improves the ability to match BFE patterns in isel.
4312 if (LHS.getOpcode() == ISD::AND) {
4313 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4314 unsigned MaskIdx, MaskLen;
4315 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4316 MaskIdx == RHSVal) {
4317 return DAG.getNode(ISD::AND, SL, VT,
4318 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4319 N->getOperand(1)),
4320 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4321 N->getOperand(1)));
4322 }
4323 }
4324 }
4325 }
4326
4327 if (VT.getScalarType() != MVT::i64)
4328 return SDValue();
4329
4330 // for C >= 32
4331 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
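// e.g. (srl i64:x, 40) becomes (build_pair (srl hi_32(x), 8), 0).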
4332
4333 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4334 // common case, splitting this into a move and a 32-bit shift is faster and
4335 // the same code size.
4336 KnownBits Known = DAG.computeKnownBits(RHS);
4337
4338 EVT ElementType = VT.getScalarType();
4339 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4340 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4341
4342 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4343 return SDValue();
4344
4345 SDValue ShiftAmt;
4346 if (CRHS) {
4347 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4348 TargetType);
4349 } else {
4350 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4351 const SDValue ShiftMask =
4352 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4353 // This AND instruction will clamp out of bounds shift values.
4354 // It will also be removed during later instruction selection.
4355 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4356 }
4357
4358 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4359 EVT ConcatType;
4360 SDValue Hi;
4361 SDLoc LHSSL(LHS);
4362 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4363 if (VT.isVector()) {
4364 unsigned NElts = TargetType.getVectorNumElements();
4365 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4366 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4367 SmallVector<SDValue, 8> HiOps(NElts);
4368 SmallVector<SDValue, 16> HiAndLoOps;
4369
4370 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4371 for (unsigned I = 0; I != NElts; ++I)
4372 HiOps[I] = HiAndLoOps[2 * I + 1];
4373 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4374 } else {
4375 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4376 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4377 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4378 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4379 }
4380
4381 SDValue NewShift =
4382 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4383
4384 SDValue Vec;
4385 if (VT.isVector()) {
4386 unsigned NElts = TargetType.getVectorNumElements();
4387 SmallVector<SDValue, 8> LoOps;
4388 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4389
4390 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4391 for (unsigned I = 0; I != NElts; ++I)
4392 HiAndLoOps[2 * I] = LoOps[I];
4393 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4394 } else {
4395 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4396 }
4397 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4398}
4399
4400 SDValue AMDGPUTargetLowering::performTruncateCombine(
4401 SDNode *N, DAGCombinerInfo &DCI) const {
4402 SDLoc SL(N);
4403 SelectionDAG &DAG = DCI.DAG;
4404 EVT VT = N->getValueType(0);
4405 SDValue Src = N->getOperand(0);
4406
4407 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4408 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4409 SDValue Vec = Src.getOperand(0);
4410 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4411 SDValue Elt0 = Vec.getOperand(0);
4412 EVT EltVT = Elt0.getValueType();
4413 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4414 if (EltVT.isFloatingPoint()) {
4415 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4416 EltVT.changeTypeToInteger(), Elt0);
4417 }
4418
4419 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4420 }
4421 }
4422 }
4423
4424 // Equivalent of above for accessing the high element of a vector as an
4425 // integer operation.
4426 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4427 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4428 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4429 SDValue BV = stripBitcast(Src.getOperand(0));
4430 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4431 EVT SrcEltVT = BV.getOperand(0).getValueType();
4432 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4433 unsigned BitIndex = K->getZExtValue();
4434 unsigned PartIndex = BitIndex / SrcEltSize;
4435
4436 if (PartIndex * SrcEltSize == BitIndex &&
4437 PartIndex < BV.getNumOperands()) {
4438 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4439 SDValue SrcElt =
4440 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4441 BV.getOperand(PartIndex));
4442 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4443 }
4444 }
4445 }
4446 }
4447 }
4448
4449 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4450 //
4451 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4452 // i16 (trunc (srl (i32 (trunc x), K)))
4453 if (VT.getScalarSizeInBits() < 32) {
4454 EVT SrcVT = Src.getValueType();
4455 if (SrcVT.getScalarSizeInBits() > 32 &&
4456 (Src.getOpcode() == ISD::SRL ||
4457 Src.getOpcode() == ISD::SRA ||
4458 Src.getOpcode() == ISD::SHL)) {
4459 SDValue Amt = Src.getOperand(1);
4460 KnownBits Known = DAG.computeKnownBits(Amt);
4461
4462 // - For left shifts, do the transform as long as the shift
4463 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4464 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4465 // losing information stored in the high bits when truncating.
4466 const unsigned MaxCstSize =
4467 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4468 if (Known.getMaxValue().ule(MaxCstSize)) {
4469 EVT MidVT = VT.isVector() ?
4470 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4471 VT.getVectorNumElements()) : MVT::i32;
4472
4473 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4474 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4475 Src.getOperand(0));
4476 DCI.AddToWorklist(Trunc.getNode());
4477
4478 if (Amt.getValueType() != NewShiftVT) {
4479 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4480 DCI.AddToWorklist(Amt.getNode());
4481 }
4482
4483 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4484 Trunc, Amt);
4485 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4486 }
4487 }
4488 }
4489
4490 return SDValue();
4491}
4492
4493// We need to specifically handle i64 mul here to avoid unnecessary conversion
4494// instructions. If we only match on the legalized i64 mul expansion,
4495// SimplifyDemandedBits will be unable to remove them because there will be
4496// multiple uses due to the separate mul + mulh[su].
4497static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4498 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4499 if (Size <= 32) {
4500 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4501 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4502 }
4503
4504 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4505 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4506
4507 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4508 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4509
4510 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4511}
4512
4513/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4514/// return SDValue().
4515static SDValue getAddOneOp(const SDNode *V) {
4516 if (V->getOpcode() != ISD::ADD)
4517 return SDValue();
4518
4519 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4520}
4521
4522 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4523 DAGCombinerInfo &DCI) const {
4524 assert(N->getOpcode() == ISD::MUL);
4525 EVT VT = N->getValueType(0);
4526
4527 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4528 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4529 // unnecessarily). isDivergent() is used as an approximation of whether the
4530 // value is in an SGPR.
4531 if (!N->isDivergent())
4532 return SDValue();
4533
4534 unsigned Size = VT.getSizeInBits();
4535 if (VT.isVector() || Size > 64)
4536 return SDValue();
4537
4538 SelectionDAG &DAG = DCI.DAG;
4539 SDLoc DL(N);
4540
4541 SDValue N0 = N->getOperand(0);
4542 SDValue N1 = N->getOperand(1);
4543
4544 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4545 // matching.
4546
4547 // mul x, (add y, 1) -> add (mul x, y), x
4548 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4549 SDValue AddOp = getAddOneOp(V.getNode());
4550 if (!AddOp)
4551 return SDValue();
4552
4553 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4554 return U->getOpcode() == ISD::MUL;
4555 }))
4556 return AddOp;
4557
4558 return SDValue();
4559 };
4560
4561 // FIXME: The selection pattern is not properly checking for commuted
4562 // operands, so we have to place the mul in the LHS
4563 if (SDValue MulOper = IsFoldableAdd(N0)) {
4564 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4565 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4566 }
4567
4568 if (SDValue MulOper = IsFoldableAdd(N1)) {
4569 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4570 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4571 }
4572
4573 // There are i16 integer mul/mad.
4574 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4575 return SDValue();
4576
4577 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4578 // in the source into any_extends if the result of the mul is truncated. Since
4579 // we can assume the high bits are whatever we want, use the underlying value
4580 // to keep the unknown high bits from interfering.
4581 if (N0.getOpcode() == ISD::ANY_EXTEND)
4582 N0 = N0.getOperand(0);
4583
4584 if (N1.getOpcode() == ISD::ANY_EXTEND)
4585 N1 = N1.getOperand(0);
4586
4587 SDValue Mul;
4588
4589 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4590 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4591 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4592 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4593 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4594 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4595 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4596 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4597 } else {
4598 return SDValue();
4599 }
4600
4601 // We need to use sext even for MUL_U24, because MUL_U24 is used
4602 // for signed multiply of 8 and 16-bit types.
4603 return DAG.getSExtOrTrunc(Mul, DL, VT);
4604}
4605
4606SDValue
4607 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4608 DAGCombinerInfo &DCI) const {
4609 if (N->getValueType(0) != MVT::i32)
4610 return SDValue();
4611
4612 SelectionDAG &DAG = DCI.DAG;
4613 SDLoc DL(N);
4614
4615 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4616 SDValue N0 = N->getOperand(0);
4617 SDValue N1 = N->getOperand(1);
4618
4619 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4620 // in the source into any_extends if the result of the mul is truncated. Since
4621 // we can assume the high bits are whatever we want, use the underlying value
4622 // to keep the unknown high bits from interfering.
4623 if (N0.getOpcode() == ISD::ANY_EXTEND)
4624 N0 = N0.getOperand(0);
4625 if (N1.getOpcode() == ISD::ANY_EXTEND)
4626 N1 = N1.getOperand(0);
4627
4628 // Try to use two fast 24-bit multiplies (one for each half of the result)
4629 // instead of one slow extending multiply.
4630 unsigned LoOpcode = 0;
4631 unsigned HiOpcode = 0;
4632 if (Signed) {
4633 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4634 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4635 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4636 LoOpcode = AMDGPUISD::MUL_I24;
4637 HiOpcode = AMDGPUISD::MULHI_I24;
4638 }
4639 } else {
4640 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4641 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4642 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4643 LoOpcode = AMDGPUISD::MUL_U24;
4644 HiOpcode = AMDGPUISD::MULHI_U24;
4645 }
4646 }
4647 if (!LoOpcode)
4648 return SDValue();
4649
4650 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4651 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4652 DCI.CombineTo(N, Lo, Hi);
4653 return SDValue(N, 0);
4654}
4655
4656 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4657 DAGCombinerInfo &DCI) const {
4658 EVT VT = N->getValueType(0);
4659
4660 if (!Subtarget->hasMulI24() || VT.isVector())
4661 return SDValue();
4662
4663 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4664 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4665 // unnecessarily). isDivergent() is used as an approximation of whether the
4666 // value is in an SGPR.
4667 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4668 // valu op anyway)
4669 if (Subtarget->hasSMulHi() && !N->isDivergent())
4670 return SDValue();
4671
4672 SelectionDAG &DAG = DCI.DAG;
4673 SDLoc DL(N);
4674
4675 SDValue N0 = N->getOperand(0);
4676 SDValue N1 = N->getOperand(1);
4677
4678 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4679 return SDValue();
4680
4681 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4682 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4683
4684 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4685 DCI.AddToWorklist(Mulhi.getNode());
4686 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4687}
4688
4689 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4690 DAGCombinerInfo &DCI) const {
4691 EVT VT = N->getValueType(0);
4692
4693 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4694 return SDValue();
4695
4696 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4697 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4698 // unnecessarily). isDivergent() is used as an approximation of whether the
4699 // value is in an SGPR.
4700 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4701 // valu op anyway)
4702 if (Subtarget->hasSMulHi() && !N->isDivergent())
4703 return SDValue();
4704
4705 SelectionDAG &DAG = DCI.DAG;
4706 SDLoc DL(N);
4707
4708 SDValue N0 = N->getOperand(0);
4709 SDValue N1 = N->getOperand(1);
4710
4711 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4712 return SDValue();
4713
4714 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4715 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4716
4717 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4718 DCI.AddToWorklist(Mulhi.getNode());
4719 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4720}
4721
4722SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4723 SDValue Op,
4724 const SDLoc &DL,
4725 unsigned Opc) const {
4726 EVT VT = Op.getValueType();
4727 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4728 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4729 LegalVT != MVT::i16))
4730 return SDValue();
4731
4732 if (VT != MVT::i32)
4733 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4734
4735 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4736 if (VT != MVT::i32)
4737 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4738
4739 return FFBX;
4740}
4741
4742// The native instructions return -1 on 0 input. Optimize out a select that
4743// produces -1 on 0.
4744//
4745// TODO: If zero is not undef, we could also do this if the output is compared
4746// against the bitwidth.
4747//
4748// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4749 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4750 SDValue LHS, SDValue RHS,
4751 DAGCombinerInfo &DCI) const {
4752 if (!isNullConstant(Cond.getOperand(1)))
4753 return SDValue();
4754
4755 SelectionDAG &DAG = DCI.DAG;
4756 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4757 SDValue CmpLHS = Cond.getOperand(0);
4758
4759 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4760 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4761 if (CCOpcode == ISD::SETEQ &&
4762 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4763 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4764 unsigned Opc =
4765 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4766 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4767 }
4768
4769 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4770 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4771 if (CCOpcode == ISD::SETNE &&
4772 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4773 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4774 unsigned Opc =
4775 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4776
4777 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4778 }
4779
4780 return SDValue();
4781}
4782
4783 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4784 unsigned Op,
4785 const SDLoc &SL,
4786 SDValue Cond,
4787 SDValue N1,
4788 SDValue N2) {
4789 SelectionDAG &DAG = DCI.DAG;
4790 EVT VT = N1.getValueType();
4791
4792 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4793 N1.getOperand(0), N2.getOperand(0));
4794 DCI.AddToWorklist(NewSelect.getNode());
4795 return DAG.getNode(Op, SL, VT, NewSelect);
4796}
4797
4798// Pull a free FP operation out of a select so it may fold into uses.
4799//
4800// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4801// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4802//
4803// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4804// select c, (fabs x), +k -> fabs (select c, x, k)
4805SDValue
4806 AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4807 SDValue N) const {
4808 SelectionDAG &DAG = DCI.DAG;
4809 SDValue Cond = N.getOperand(0);
4810 SDValue LHS = N.getOperand(1);
4811 SDValue RHS = N.getOperand(2);
4812
4813 EVT VT = N.getValueType();
4814 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4815 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4816 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4817 return SDValue();
4818
4819 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4820 SDLoc(N), Cond, LHS, RHS);
4821 }
4822
4823 bool Inv = false;
4824 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4825 std::swap(LHS, RHS);
4826 Inv = true;
4827 }
4828
4829 // TODO: Support vector constants.
4830 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4831 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4832 !selectSupportsSourceMods(N.getNode())) {
4833 SDLoc SL(N);
4834 // If one side is an fneg/fabs and the other is a constant, we can push the
4835 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4836 SDValue NewLHS = LHS.getOperand(0);
4837 SDValue NewRHS = RHS;
4838
4839 // Careful: if the neg can be folded up, don't try to pull it back down.
4840 bool ShouldFoldNeg = true;
4841
4842 if (NewLHS.hasOneUse()) {
4843 unsigned Opc = NewLHS.getOpcode();
4844 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4845 ShouldFoldNeg = false;
4846 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4847 ShouldFoldNeg = false;
4848 }
4849
4850 if (ShouldFoldNeg) {
4851 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4852 return SDValue();
4853
4854 // We're going to be forced to use a source modifier anyway, so there's no
4855 // point in pulling the negate out unless we can get a size reduction by
4856 // negating the constant.
4857 //
4858 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4859 // about cheaper constants.
4860 if (NewLHS.getOpcode() == ISD::FABS &&
4861 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4862 return SDValue();
4863
4864 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4865 return SDValue();
4866
4867 if (LHS.getOpcode() == ISD::FNEG)
4868 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4869
4870 if (Inv)
4871 std::swap(NewLHS, NewRHS);
4872
4873 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4874 Cond, NewLHS, NewRHS);
4875 DCI.AddToWorklist(NewSelect.getNode());
4876 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4877 }
4878 }
4879
4880 return SDValue();
4881}
4882
4883 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4884 DAGCombinerInfo &DCI) const {
4885 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4886 return Folded;
4887
4888 SDValue Cond = N->getOperand(0);
4889 if (Cond.getOpcode() != ISD::SETCC)
4890 return SDValue();
4891
4892 EVT VT = N->getValueType(0);
4893 SDValue LHS = Cond.getOperand(0);
4894 SDValue RHS = Cond.getOperand(1);
4895 SDValue CC = Cond.getOperand(2);
4896
4897 SDValue True = N->getOperand(1);
4898 SDValue False = N->getOperand(2);
4899
4900 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4901 SelectionDAG &DAG = DCI.DAG;
4902 if (DAG.isConstantValueOfAnyType(True) &&
4903 !DAG.isConstantValueOfAnyType(False)) {
4904 // Swap cmp + select pair to move constant to false input.
4905 // This will allow using VOPC cndmasks more often.
4906 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4907
4908 SDLoc SL(N);
4909 ISD::CondCode NewCC =
4910 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4911
4912 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4913 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4914 }
4915
4916 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4917 SDValue MinMax
4918 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4919 // Revisit this node so we can catch min3/max3/med3 patterns.
4920 //DCI.AddToWorklist(MinMax.getNode());
4921 return MinMax;
4922 }
4923 }
4924
4925 // There's no reason to not do this if the condition has other uses.
4926 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4927}
4928
4929static bool isInv2Pi(const APFloat &APF) {
4930 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4931 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4932 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4933
4934 return APF.bitwiseIsEqual(KF16) ||
4935 APF.bitwiseIsEqual(KF32) ||
4936 APF.bitwiseIsEqual(KF64);
4937}
4938
4939 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4940// additional cost to negate them.
4941 TargetLowering::NegatibleCost
4942 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4943 if (C->isZero())
4944 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4945
4946 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4947 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4948
4949 return NegatibleCost::Neutral;
4950}
4951
4957
4963
4964static unsigned inverseMinMax(unsigned Opc) {
4965 switch (Opc) {
4966 case ISD::FMAXNUM:
4967 return ISD::FMINNUM;
4968 case ISD::FMINNUM:
4969 return ISD::FMAXNUM;
4970 case ISD::FMAXNUM_IEEE:
4971 return ISD::FMINNUM_IEEE;
4972 case ISD::FMINNUM_IEEE:
4973 return ISD::FMAXNUM_IEEE;
4974 case ISD::FMAXIMUM:
4975 return ISD::FMINIMUM;
4976 case ISD::FMINIMUM:
4977 return ISD::FMAXIMUM;
4978 case ISD::FMAXIMUMNUM:
4979 return ISD::FMINIMUMNUM;
4980 case ISD::FMINIMUMNUM:
4981 return ISD::FMAXIMUMNUM;
4982 case AMDGPUISD::FMAX_LEGACY:
4983 return AMDGPUISD::FMIN_LEGACY;
4984 case AMDGPUISD::FMIN_LEGACY:
4985 return AMDGPUISD::FMAX_LEGACY;
4986 default:
4987 llvm_unreachable("invalid min/max opcode");
4988 }
4989}
4990
4991/// \return true if it's profitable to try to push an fneg into its source
4992/// instruction.
4993 static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4994 // If the input has multiple uses and we can either fold the negate down, or
4995 // the other uses cannot, give up. This both prevents unprofitable
4996 // transformations and infinite loops: we won't repeatedly try to fold around
4997 // a negate that has no 'good' form.
4998 if (N0.hasOneUse()) {
4999 // This may be able to fold into the source, but at a code size cost. Don't
5000 // fold if the fold into the user is free.
5001 if (allUsesHaveSourceMods(N, 0))
5002 return false;
5003 } else {
5004 if (fnegFoldsIntoOp(N0.getNode()) &&
5005 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
5006 return false;
5007 }
5008
5009 return true;
5010}
5011
5012 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
5013 DAGCombinerInfo &DCI) const {
5014 SelectionDAG &DAG = DCI.DAG;
5015 SDValue N0 = N->getOperand(0);
5016 EVT VT = N->getValueType(0);
5017
5018 unsigned Opc = N0.getOpcode();
5019
5020 if (!shouldFoldFNegIntoSrc(N, N0))
5021 return SDValue();
5022
5023 SDLoc SL(N);
5024 switch (Opc) {
5025 case ISD::FADD: {
5026 if (!mayIgnoreSignedZero(N0))
5027 return SDValue();
5028
5029 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5030 SDValue LHS = N0.getOperand(0);
5031 SDValue RHS = N0.getOperand(1);
5032
5033 if (LHS.getOpcode() != ISD::FNEG)
5034 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5035 else
5036 LHS = LHS.getOperand(0);
5037
5038 if (RHS.getOpcode() != ISD::FNEG)
5039 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5040 else
5041 RHS = RHS.getOperand(0);
5042
5043 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5044 if (Res.getOpcode() != ISD::FADD)
5045 return SDValue(); // Op got folded away.
5046 if (!N0.hasOneUse())
5047 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5048 return Res;
5049 }
5050 case ISD::FMUL:
5051 case AMDGPUISD::FMUL_LEGACY: {
5052 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5053 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5054 SDValue LHS = N0.getOperand(0);
5055 SDValue RHS = N0.getOperand(1);
5056
5057 if (LHS.getOpcode() == ISD::FNEG)
5058 LHS = LHS.getOperand(0);
5059 else if (RHS.getOpcode() == ISD::FNEG)
5060 RHS = RHS.getOperand(0);
5061 else
5062 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5063
5064 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5065 if (Res.getOpcode() != Opc)
5066 return SDValue(); // Op got folded away.
5067 if (!N0.hasOneUse())
5068 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5069 return Res;
5070 }
5071 case ISD::FMA:
5072 case ISD::FMAD: {
5073 // TODO: handle llvm.amdgcn.fma.legacy
5074 if (!mayIgnoreSignedZero(N0))
5075 return SDValue();
5076
5077 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5078 SDValue LHS = N0.getOperand(0);
5079 SDValue MHS = N0.getOperand(1);
5080 SDValue RHS = N0.getOperand(2);
5081
5082 if (LHS.getOpcode() == ISD::FNEG)
5083 LHS = LHS.getOperand(0);
5084 else if (MHS.getOpcode() == ISD::FNEG)
5085 MHS = MHS.getOperand(0);
5086 else
5087 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5088
5089 if (RHS.getOpcode() != ISD::FNEG)
5090 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5091 else
5092 RHS = RHS.getOperand(0);
5093
5094 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5095 if (Res.getOpcode() != Opc)
5096 return SDValue(); // Op got folded away.
5097 if (!N0.hasOneUse())
5098 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5099 return Res;
5100 }
5101 case ISD::FMAXNUM:
5102 case ISD::FMINNUM:
5103 case ISD::FMAXNUM_IEEE:
5104 case ISD::FMINNUM_IEEE:
5105 case ISD::FMINIMUM:
5106 case ISD::FMAXIMUM:
5107 case ISD::FMINIMUMNUM:
5108 case ISD::FMAXIMUMNUM:
5109 case AMDGPUISD::FMAX_LEGACY:
5110 case AMDGPUISD::FMIN_LEGACY: {
5111 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5112 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5113 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5114 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5115
5116 SDValue LHS = N0.getOperand(0);
5117 SDValue RHS = N0.getOperand(1);
5118
5119 // 0 doesn't have a negated inline immediate.
5120 // TODO: This constant check should be generalized to other operations.
5122 return SDValue();
5123
5124 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5125 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5126 unsigned Opposite = inverseMinMax(Opc);
5127
5128 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5129 if (Res.getOpcode() != Opposite)
5130 return SDValue(); // Op got folded away.
5131 if (!N0.hasOneUse())
5132 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5133 return Res;
5134 }
5135 case AMDGPUISD::FMED3: {
5136 SDValue Ops[3];
5137 for (unsigned I = 0; I < 3; ++I)
5138 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5139
5140 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5141 if (Res.getOpcode() != AMDGPUISD::FMED3)
5142 return SDValue(); // Op got folded away.
5143
5144 if (!N0.hasOneUse()) {
5145 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5146 DAG.ReplaceAllUsesWith(N0, Neg);
5147
5148 for (SDNode *U : Neg->users())
5149 DCI.AddToWorklist(U);
5150 }
5151
5152 return Res;
5153 }
5154 case ISD::FP_EXTEND:
5155 case ISD::FTRUNC:
5156 case ISD::FRINT:
5157 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5158 case ISD::FROUNDEVEN:
5159 case ISD::FSIN:
5160 case ISD::FCANONICALIZE:
5161 case AMDGPUISD::RCP:
5162 case AMDGPUISD::RCP_LEGACY:
5163 case AMDGPUISD::RCP_IFLAG:
5164 case AMDGPUISD::SIN_HW: {
5165 SDValue CvtSrc = N0.getOperand(0);
5166 if (CvtSrc.getOpcode() == ISD::FNEG) {
5167 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5168 // (fneg (rcp (fneg x))) -> (rcp x)
5169 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5170 }
5171
5172 if (!N0.hasOneUse())
5173 return SDValue();
5174
5175 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5176 // (fneg (rcp x)) -> (rcp (fneg x))
5177 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5178 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5179 }
5180 case ISD::FP_ROUND: {
5181 SDValue CvtSrc = N0.getOperand(0);
5182
5183 if (CvtSrc.getOpcode() == ISD::FNEG) {
5184 // (fneg (fp_round (fneg x))) -> (fp_round x)
5185 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5186 CvtSrc.getOperand(0), N0.getOperand(1));
5187 }
5188
5189 if (!N0.hasOneUse())
5190 return SDValue();
5191
5192 // (fneg (fp_round x)) -> (fp_round (fneg x))
5193 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5194 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5195 }
5196 case ISD::FP16_TO_FP: {
5197 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5198 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5199 // Put the fneg back as a legal source operation that can be matched later.
5200 SDLoc SL(N);
5201
5202 SDValue Src = N0.getOperand(0);
5203 EVT SrcVT = Src.getValueType();
5204
5205 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5206 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5207 DAG.getConstant(0x8000, SL, SrcVT));
5208 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5209 }
5210 case ISD::SELECT: {
5211 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5212 // TODO: Invert conditions of foldFreeOpFromSelect
5213 return SDValue();
5214 }
5215 case ISD::BITCAST: {
5216 SDLoc SL(N);
5217 SDValue BCSrc = N0.getOperand(0);
5218 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5219 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5220 if (HighBits.getValueType().getSizeInBits() != 32 ||
5221 !fnegFoldsIntoOp(HighBits.getNode()))
5222 return SDValue();
5223
5224 // f64 fneg only really needs to operate on the high half of the
5225 // register, so try to force it to an f32 operation to help make use of
5226 // source modifiers.
5227 //
5228 //
5229 // fneg (f64 (bitcast (build_vector x, y))) ->
5230 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5231 // (fneg (bitcast i32:y to f32)))
5232
5233 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5234 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5235 SDValue CastBack =
5236 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5237
5238 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5239 Ops.back() = CastBack;
5240 DCI.AddToWorklist(NegHi.getNode());
5241 SDValue Build =
5242 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5243 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5244
5245 if (!N0.hasOneUse())
5246 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5247 return Result;
5248 }
5249
5250 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5251 BCSrc.hasOneUse()) {
5252 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5253 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5254
5255 // TODO: Cast back result for multiple uses is beneficial in some cases.
5256
5257 SDValue LHS =
5258 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5259 SDValue RHS =
5260 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5261
5262 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5263 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5264
5265 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5266 NegRHS);
5267 }
5268
5269 return SDValue();
5270 }
5271 default:
5272 return SDValue();
5273 }
5274}
5275
5276 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5277 DAGCombinerInfo &DCI) const {
5278 SelectionDAG &DAG = DCI.DAG;
5279 SDValue N0 = N->getOperand(0);
5280
5281 if (!N0.hasOneUse())
5282 return SDValue();
5283
5284 switch (N0.getOpcode()) {
5285 case ISD::FP16_TO_FP: {
5286 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5287 SDLoc SL(N);
5288 SDValue Src = N0.getOperand(0);
5289 EVT SrcVT = Src.getValueType();
5290
5291 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5292 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5293 DAG.getConstant(0x7fff, SL, SrcVT));
5294 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5295 }
5296 default:
5297 return SDValue();
5298 }
5299}
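Editorial note (not part of the LLVM source listing): the two combines above rewrite fneg/fabs of an illegal f16 value as integer bit operations on the stored i16 pattern. A minimal standalone sketch of that bit-level identity in plain C++; the helper names are hypothetical.

#include <cstdint>

// IEEE half: bit 15 is the sign bit, so negation toggles it and abs clears it.
static uint16_t fnegF16Bits(uint16_t Bits) { return Bits ^ 0x8000; }
static uint16_t fabsF16Bits(uint16_t Bits) { return Bits & 0x7fff; }

// Example: 0xC200 encodes -3.0 in f16; fnegF16Bits(0xC200) and
// fabsF16Bits(0xC200) both yield 0x4200, which encodes +3.0.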
5300
5301 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5302 DAGCombinerInfo &DCI) const {
5303 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5304 if (!CFP)
5305 return SDValue();
5306
5307 // XXX - Should this flush denormals?
5308 const APFloat &Val = CFP->getValueAPF();
5309 APFloat One(Val.getSemantics(), "1.0");
5310 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5311}
5312
5313 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5314 DAGCombinerInfo &DCI) const {
5315 SelectionDAG &DAG = DCI.DAG;
5316 SDLoc DL(N);
5317
5318 switch(N->getOpcode()) {
5319 default:
5320 break;
5321 case ISD::BITCAST: {
5322 EVT DestVT = N->getValueType(0);
5323
5324 // Push casts through vector builds. This helps avoid emitting a large
5325 // number of copies when materializing floating point vector constants.
5326 //
5327 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5328 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5329 if (DestVT.isVector()) {
5330 SDValue Src = N->getOperand(0);
5331 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5332 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5333 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5334 EVT SrcVT = Src.getValueType();
5335 unsigned NElts = DestVT.getVectorNumElements();
5336
5337 if (SrcVT.getVectorNumElements() == NElts) {
5338 EVT DestEltVT = DestVT.getVectorElementType();
5339
5340 SmallVector<SDValue, 8> CastedElts;
5341 SDLoc SL(N);
5342 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5343 SDValue Elt = Src.getOperand(I);
5344 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5345 }
5346
5347 return DAG.getBuildVector(DestVT, SL, CastedElts);
5348 }
5349 }
5350 }
5351
5352 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5353 break;
5354
5355 // Fold bitcasts of constants.
5356 //
5357 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5358 // TODO: Generalize and move to DAGCombiner
5359 SDValue Src = N->getOperand(0);
5360 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5361 SDLoc SL(N);
5362 uint64_t CVal = C->getZExtValue();
5363 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5364 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5365 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5366 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5367 }
5368
5369 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5370 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5371 SDLoc SL(N);
5372 uint64_t CVal = Val.getZExtValue();
5373 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5374 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5375 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5376
5377 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5378 }
5379
5380 break;
5381 }
5382 case ISD::SHL:
5383 case ISD::SRA:
5384 case ISD::SRL: {
5385 // Range metadata can be invalidated when loads are converted to legal types
5386 // (e.g. v2i64 -> v4i32).
5387 // Try to convert vector shl/sra/srl before type legalization so that range
5388 // metadata can be utilized.
5389 if (!(N->getValueType(0).isVector() &&
5392 break;
5393 if (N->getOpcode() == ISD::SHL)
5394 return performShlCombine(N, DCI);
5395 if (N->getOpcode() == ISD::SRA)
5396 return performSraCombine(N, DCI);
5397 return performSrlCombine(N, DCI);
5398 }
5399 case ISD::TRUNCATE:
5400 return performTruncateCombine(N, DCI);
5401 case ISD::MUL:
5402 return performMulCombine(N, DCI);
5403 case AMDGPUISD::MUL_U24:
5404 case AMDGPUISD::MUL_I24: {
5405 if (SDValue Simplified = simplifyMul24(N, DCI))
5406 return Simplified;
5407 break;
5408 }
5409 case AMDGPUISD::MULHI_I24:
5410 case AMDGPUISD::MULHI_U24:
5411 return simplifyMul24(N, DCI);
5412 case ISD::SMUL_LOHI:
5413 case ISD::UMUL_LOHI:
5414 return performMulLoHiCombine(N, DCI);
5415 case ISD::MULHS:
5416 return performMulhsCombine(N, DCI);
5417 case ISD::MULHU:
5418 return performMulhuCombine(N, DCI);
5419 case ISD::SELECT:
5420 return performSelectCombine(N, DCI);
5421 case ISD::FNEG:
5422 return performFNegCombine(N, DCI);
5423 case ISD::FABS:
5424 return performFAbsCombine(N, DCI);
5425 case AMDGPUISD::BFE_I32:
5426 case AMDGPUISD::BFE_U32: {
5427 assert(!N->getValueType(0).isVector() &&
5428 "Vector handling of BFE not implemented");
5429 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5430 if (!Width)
5431 break;
5432
5433 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5434 if (WidthVal == 0)
5435 return DAG.getConstant(0, DL, MVT::i32);
5436
5437 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5438 if (!Offset)
5439 break;
5440
5441 SDValue BitsFrom = N->getOperand(0);
5442 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5443
5444 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5445
5446 if (OffsetVal == 0) {
5447 // This is already sign / zero extended, so try to fold away extra BFEs.
5448 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5449
5450 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5451 if (OpSignBits >= SignBits)
5452 return BitsFrom;
5453
5454 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5455 if (Signed) {
5456 // This is a sign_extend_inreg. Replace it to take advantage of existing
5457 // DAG Combines. If not eliminated, we will match back to BFE during
5458 // selection.
5459
5460 // TODO: The sext_inreg of extended types ends up as multiple operations,
5461 // although we could handle them in a single BFE.
5462 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5463 DAG.getValueType(SmallVT));
5464 }
5465
5466 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5467 }
5468
5469 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5470 if (Signed) {
5471 return constantFoldBFE<int32_t>(DAG,
5472 CVal->getSExtValue(),
5473 OffsetVal,
5474 WidthVal,
5475 DL);
5476 }
5477
5478 return constantFoldBFE<uint32_t>(DAG,
5479 CVal->getZExtValue(),
5480 OffsetVal,
5481 WidthVal,
5482 DL);
5483 }
5484
5485 if ((OffsetVal + WidthVal) >= 32 &&
5486 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5487 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5488 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5489 BitsFrom, ShiftVal);
5490 }
5491
5492 if (BitsFrom.hasOneUse()) {
5493 APInt Demanded = APInt::getBitsSet(32,
5494 OffsetVal,
5495 OffsetVal + WidthVal);
5496
5497 KnownBits Known;
5498 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5499 !DCI.isBeforeLegalizeOps());
5500 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5501 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5502 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5503 DCI.CommitTargetLoweringOpt(TLO);
5504 }
5505 }
5506
5507 break;
5508 }
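Editorial note (not part of the LLVM source listing): the BFE folds above assume the 32-bit bitfield-extract semantics sketched below in plain C++. Offset and width are masked to 5 bits, and a field that reaches bit 31 degenerates to a plain shift, mirroring the constantFoldBFE and shift fallbacks in this case. Illustrative only.

#include <cstdint>

// Unsigned extract: shift the field down and mask to Width bits.
static uint32_t bfeU32(uint32_t Src, uint32_t Offset, uint32_t Width) {
  Offset &= 0x1f;
  Width &= 0x1f;
  if (Width == 0)
    return 0;
  if (Offset + Width < 32)
    return (Src >> Offset) & ((1u << Width) - 1);
  return Src >> Offset; // Field reaches bit 31: plain logical shift.
}

// Signed extract: move the field to the top, then arithmetic-shift it back down.
static int32_t bfeI32(int32_t Src, uint32_t Offset, uint32_t Width) {
  Offset &= 0x1f;
  Width &= 0x1f;
  if (Width == 0)
    return 0;
  if (Offset + Width < 32) {
    uint32_t Shl = static_cast<uint32_t>(Src) << (32 - Offset - Width);
    return static_cast<int32_t>(Shl) >> (32 - Width);
  }
  return Src >> Offset; // Field reaches bit 31: arithmetic shift.
}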
5509 case ISD::LOAD:
5510 return performLoadCombine(N, DCI);
5511 case ISD::STORE:
5512 return performStoreCombine(N, DCI);
5513 case AMDGPUISD::RCP:
5514 case AMDGPUISD::RCP_IFLAG:
5515 return performRcpCombine(N, DCI);
5516 case ISD::AssertZext:
5517 case ISD::AssertSext:
5518 return performAssertSZExtCombine(N, DCI);
5519 case ISD::INTRINSIC_WO_CHAIN:
5520 return performIntrinsicWOChainCombine(N, DCI);
5521 case AMDGPUISD::FMAD_FTZ: {
5522 SDValue N0 = N->getOperand(0);
5523 SDValue N1 = N->getOperand(1);
5524 SDValue N2 = N->getOperand(2);
5525 EVT VT = N->getValueType(0);
5526
5527 // FMAD_FTZ is a FMAD + flush denormals to zero.
5528 // We flush the inputs, the intermediate step, and the output.
5529 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5530 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5531 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5532 if (N0CFP && N1CFP && N2CFP) {
5533 const auto FTZ = [](const APFloat &V) {
5534 if (V.isDenormal()) {
5535 APFloat Zero(V.getSemantics(), 0);
5536 return V.isNegative() ? -Zero : Zero;
5537 }
5538 return V;
5539 };
5540
5541 APFloat V0 = FTZ(N0CFP->getValueAPF());
5542 APFloat V1 = FTZ(N1CFP->getValueAPF());
5543 APFloat V2 = FTZ(N2CFP->getValueAPF());
5544 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5545 V0 = FTZ(V0);
5546 V0.add(V2, APFloat::rmNearestTiesToEven);
5547 return DAG.getConstantFP(FTZ(V0), DL, VT);
5548 }
5549 break;
5550 }
5551 }
5552 return SDValue();
5553}
5554
5555//===----------------------------------------------------------------------===//
5556// Helper functions
5557//===----------------------------------------------------------------------===//
5558
5559 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5560 const TargetRegisterClass *RC,
5561 Register Reg, EVT VT,
5562 const SDLoc &SL,
5563 bool RawReg) const {
5564 MachineFunction &MF = DAG.getMachineFunction();
5565 MachineRegisterInfo &MRI = MF.getRegInfo();
5566 Register VReg;
5567
5568 if (!MRI.isLiveIn(Reg)) {
5569 VReg = MRI.createVirtualRegister(RC);
5570 MRI.addLiveIn(Reg, VReg);
5571 } else {
5572 VReg = MRI.getLiveInVirtReg(Reg);
5573 }
5574
5575 if (RawReg)
5576 return DAG.getRegister(VReg, VT);
5577
5578 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5579}
5580
5581// This may be called multiple times, and nothing prevents creating multiple
5582// objects at the same offset. See if we already defined this object.
5583 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5584 int64_t Offset) {
5585 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5586 if (MFI.getObjectOffset(I) == Offset) {
5587 assert(MFI.getObjectSize(I) == Size);
5588 return I;
5589 }
5590 }
5591
5592 return MFI.CreateFixedObject(Size, Offset, true);
5593}
5594
5595 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5596 EVT VT,
5597 const SDLoc &SL,
5598 int64_t Offset) const {
5599 MachineFunction &MF = DAG.getMachineFunction();
5600 MachineFrameInfo &MFI = MF.getFrameInfo();
5601 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5602
5603 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5604 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5605
5606 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5609}
5610
5611 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5612 const SDLoc &SL,
5613 SDValue Chain,
5614 SDValue ArgVal,
5615 int64_t Offset) const {
5616 MachineFunction &MF = DAG.getMachineFunction();
5617 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5618 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5619
5620 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5621 // Stores to the argument stack area are relative to the stack pointer.
5622 SDValue SP =
5623 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5624 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5625 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5626 MachineMemOperand::MODereferenceable);
5627 return Store;
5628}
5629
5630 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5631 const TargetRegisterClass *RC,
5632 EVT VT, const SDLoc &SL,
5633 const ArgDescriptor &Arg) const {
5634 assert(Arg && "Attempting to load missing argument");
5635
5636 SDValue V = Arg.isRegister() ?
5637 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5638 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5639
5640 if (!Arg.isMasked())
5641 return V;
5642
5643 unsigned Mask = Arg.getMask();
5644 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5645 V = DAG.getNode(ISD::SRL, SL, VT, V,
5646 DAG.getShiftAmountConstant(Shift, VT, SL));
5647 return DAG.getNode(ISD::AND, SL, VT, V,
5648 DAG.getConstant(Mask >> Shift, SL, VT));
5649}
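Editorial note (not part of the LLVM source listing): the mask/shift sequence at the end of loadInputValue isolates one field of a packed input register. A small sketch of the same arithmetic in plain C++; the mask value in the example is hypothetical.

#include <cstdint>

// Extract the field selected by Mask from a packed 32-bit argument:
// shift the field down to bit 0, then mask it to its width.
static uint32_t decodeMaskedArg(uint32_t Packed, uint32_t Mask) {
  unsigned Shift = __builtin_ctz(Mask);   // GCC/Clang builtin, like countr_zero
  return (Packed >> Shift) & (Mask >> Shift);
}

// Example: with Mask = 0x000ffc00 (bits 10..19) and Packed = 0x0004f800,
// decodeMaskedArg returns 0x13e.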
5650
5651 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5652 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5653 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5654 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5655 uint64_t ArgOffset =
5656 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5657 switch (Param) {
5658 case FIRST_IMPLICIT:
5659 return ArgOffset;
5660 case PRIVATE_BASE:
5661 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5662 case SHARED_BASE:
5663 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5664 case QUEUE_PTR:
5665 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5666 }
5667 llvm_unreachable("unexpected implicit parameter type");
5668}
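Editorial note (not part of the LLVM source listing): a concrete illustration of the offset computation above, with hypothetical sizes and the subtarget's explicit kernel-argument offset taken as zero. With 36 bytes of explicit kernel arguments and an 8-byte implicit-argument alignment, FIRST_IMPLICIT lands at byte 40, and the named parameters sit at their AMDGPU::ImplicitArg offsets past that point.

#include <cstdint>

// alignTo rounds a size up to the next multiple of Align.
static uint64_t alignToExample(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

static uint64_t firstImplicitArgOffset() {
  const uint64_t ExplicitKernArgSize = 36; // hypothetical kernel
  const uint64_t Alignment = 8;            // hypothetical alignment
  return alignToExample(ExplicitKernArgSize, Alignment); // 40
}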
5669
5670 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5671 const MachineFunction &MF, const ImplicitParameter Param) const {
5672 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5673 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5674 }
5675
5676 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5677 SelectionDAG &DAG, int Enabled,
5678 int &RefinementSteps,
5679 bool &UseOneConstNR,
5680 bool Reciprocal) const {
5681 EVT VT = Operand.getValueType();
5682
5683 if (VT == MVT::f32) {
5684 RefinementSteps = 0;
5685 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5686 }
5687
5688 // TODO: There is also an f64 rsq instruction, but the documentation is less
5689 // clear on its precision.
5690
5691 return SDValue();
5692}
5693
5694 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5695 SelectionDAG &DAG, int Enabled,
5696 int &RefinementSteps) const {
5697 EVT VT = Operand.getValueType();
5698
5699 if (VT == MVT::f32) {
5700 // Reciprocal, < 1 ulp error.
5701 //
5702 // This reciprocal approximation converges to < 0.5 ulp error with one
5703 // Newton-Raphson step performed with two fused multiply-adds (FMAs).
5704
5705 RefinementSteps = 0;
5706 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5707 }
5708
5709 // TODO: There is also an f64 rcp instruction, but the documentation is less
5710 // clear on its precision.
5711
5712 return SDValue();
5713}
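Editorial note (not part of the LLVM source listing): the comment above refers to one Newton-Raphson refinement of a reciprocal estimate expressed as two FMAs. A minimal sketch of that step in plain C++, illustrative only; the f32 path in this file returns the raw RCP node without any refinement step.

#include <cmath>

// One Newton-Raphson step for x ~= 1/d, written as two fused multiply-adds.
static float refineRecip(float D, float X0) {
  float E = std::fma(-D, X0, 1.0f); // e  = 1 - d*x0 (residual error)
  return std::fma(X0, E, X0);       // x1 = x0 + x0*e
}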
5714
5715static unsigned workitemIntrinsicDim(unsigned ID) {
5716 switch (ID) {
5717 case Intrinsic::amdgcn_workitem_id_x:
5718 return 0;
5719 case Intrinsic::amdgcn_workitem_id_y:
5720 return 1;
5721 case Intrinsic::amdgcn_workitem_id_z:
5722 return 2;
5723 default:
5724 llvm_unreachable("not a workitem intrinsic");
5725 }
5726}
5727
5728 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5729 const SDValue Op, KnownBits &Known,
5730 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5731
5732 Known.resetAll(); // Don't know anything.
5733
5734 unsigned Opc = Op.getOpcode();
5735
5736 switch (Opc) {
5737 default:
5738 break;
5739 case AMDGPUISD::CARRY:
5740 case AMDGPUISD::BORROW: {
5741 Known.Zero = APInt::getHighBitsSet(32, 31);
5742 break;
5743 }
5744
5745 case AMDGPUISD::BFE_I32:
5746 case AMDGPUISD::BFE_U32: {
5747 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5748 if (!CWidth)
5749 return;
5750
5751 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5752
5753 if (Opc == AMDGPUISD::BFE_U32)
5754 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5755
5756 break;
5757 }
5758 case AMDGPUISD::FP_TO_FP16: {
5759 unsigned BitWidth = Known.getBitWidth();
5760
5761 // High bits are zero.
5762 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5763 break;
5764 }
5765 case AMDGPUISD::MUL_U24:
5766 case AMDGPUISD::MUL_I24: {
5767 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5768 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5769 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5770 RHSKnown.countMinTrailingZeros();
5771 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5772 // Skip extra check if all bits are known zeros.
5773 if (TrailZ >= 32)
5774 break;
5775
5776 // Truncate to 24 bits.
5777 LHSKnown = LHSKnown.trunc(24);
5778 RHSKnown = RHSKnown.trunc(24);
5779
5780 if (Opc == AMDGPUISD::MUL_I24) {
5781 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5782 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5783 unsigned MaxValBits = LHSValBits + RHSValBits;
5784 if (MaxValBits > 32)
5785 break;
5786 unsigned SignBits = 32 - MaxValBits + 1;
5787 bool LHSNegative = LHSKnown.isNegative();
5788 bool LHSNonNegative = LHSKnown.isNonNegative();
5789 bool LHSPositive = LHSKnown.isStrictlyPositive();
5790 bool RHSNegative = RHSKnown.isNegative();
5791 bool RHSNonNegative = RHSKnown.isNonNegative();
5792 bool RHSPositive = RHSKnown.isStrictlyPositive();
5793
5794 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5795 Known.Zero.setHighBits(SignBits);
5796 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5797 Known.One.setHighBits(SignBits);
5798 } else {
5799 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5800 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5801 unsigned MaxValBits = LHSValBits + RHSValBits;
5802 if (MaxValBits >= 32)
5803 break;
5804 Known.Zero.setBitsFrom(MaxValBits);
5805 }
5806 break;
5807 }
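Editorial note (not part of the LLVM source listing): the trailing-zero reasoning in the MUL_U24/MUL_I24 case above follows from ordinary integer arithmetic: a product has at least the sum of its operands' trailing zero counts. A tiny check of that fact in plain C++.

#include <cassert>
#include <cstdint>

static unsigned productTrailingZeros() {
  uint32_t X = 24;    // 0b11000, 3 trailing zeros
  uint32_t Y = 20;    // 0b10100, 2 trailing zeros
  uint32_t P = X * Y; // 480 = 0b111100000, at least 3 + 2 = 5 trailing zeros
  assert((P & 0x1fu) == 0);
  return __builtin_ctz(P); // GCC/Clang builtin; returns 5 for this pair
}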
5808 case AMDGPUISD::PERM: {
5809 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5810 if (!CMask)
5811 return;
5812
5813 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5814 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5815 unsigned Sel = CMask->getZExtValue();
5816
5817 for (unsigned I = 0; I < 32; I += 8) {
5818 unsigned SelBits = Sel & 0xff;
5819 if (SelBits < 4) {
5820 SelBits *= 8;
5821 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5822 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5823 } else if (SelBits < 7) {
5824 SelBits = (SelBits & 3) * 8;
5825 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5826 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5827 } else if (SelBits == 0x0c) {
5828 Known.Zero |= 0xFFull << I;
5829 } else if (SelBits > 0x0c) {
5830 Known.One |= 0xFFull << I;
5831 }
5832 Sel >>= 8;
5833 }
5834 break;
5835 }
5836 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5837 Known.Zero.setHighBits(24);
5838 break;
5839 }
5840 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5841 Known.Zero.setHighBits(16);
5842 break;
5843 }
5844 case AMDGPUISD::LDS: {
5845 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5846 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5847
5848 Known.Zero.setHighBits(16);
5849 Known.Zero.setLowBits(Log2(Alignment));
5850 break;
5851 }
5852 case AMDGPUISD::SMIN3:
5853 case AMDGPUISD::SMAX3:
5854 case AMDGPUISD::SMED3:
5855 case AMDGPUISD::UMIN3:
5856 case AMDGPUISD::UMAX3:
5857 case AMDGPUISD::UMED3: {
5858 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5859 if (Known2.isUnknown())
5860 break;
5861
5862 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5863 if (Known1.isUnknown())
5864 break;
5865
5866 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5867 if (Known0.isUnknown())
5868 break;
5869
5870 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5871 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5872 Known.One = Known0.One & Known1.One & Known2.One;
5873 break;
5874 }
5875 case ISD::INTRINSIC_WO_CHAIN: {
5876 unsigned IID = Op.getConstantOperandVal(0);
5877 switch (IID) {
5878 case Intrinsic::amdgcn_workitem_id_x:
5879 case Intrinsic::amdgcn_workitem_id_y:
5880 case Intrinsic::amdgcn_workitem_id_z: {
5881 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5882 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5883 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5884 break;
5885 }
5886 default:
5887 break;
5888 }
5889 }
5890 }
5891}
5892
5893 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5894 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5895 unsigned Depth) const {
5896 switch (Op.getOpcode()) {
5897 case AMDGPUISD::BFE_I32: {
5898 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5899 if (!Width)
5900 return 1;
5901
5902 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5903 if (!isNullConstant(Op.getOperand(1)))
5904 return SignBits;
5905
5906 // TODO: Could probably figure something out with non-0 offsets.
5907 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5908 return std::max(SignBits, Op0SignBits);
5909 }
5910
5911 case AMDGPUISD::BFE_U32: {
5912 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5913 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5914 }
5915
5916 case AMDGPUISD::CARRY:
5917 case AMDGPUISD::BORROW:
5918 return 31;
5919 case AMDGPUISD::BUFFER_LOAD_BYTE:
5920 return 25;
5921 case AMDGPUISD::BUFFER_LOAD_SHORT:
5922 return 17;
5923 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5924 return 24;
5925 case AMDGPUISD::BUFFER_LOAD_USHORT:
5926 return 16;
5927 case AMDGPUISD::FP_TO_FP16:
5928 return 16;
5929 case AMDGPUISD::SMIN3:
5930 case AMDGPUISD::SMAX3:
5931 case AMDGPUISD::SMED3:
5932 case AMDGPUISD::UMIN3:
5933 case AMDGPUISD::UMAX3:
5934 case AMDGPUISD::UMED3: {
5935 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5936 if (Tmp2 == 1)
5937 return 1; // Early out.
5938
5939 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5940 if (Tmp1 == 1)
5941 return 1; // Early out.
5942
5943 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5944 if (Tmp0 == 1)
5945 return 1; // Early out.
5946
5947 return std::min({Tmp0, Tmp1, Tmp2});
5948 }
5949 default:
5950 return 1;
5951 }
5952}
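Editorial note (not part of the LLVM source listing): for intuition on the BFE_I32 result above (32 - Width + 1), a signed field of Width bits, sign-extended into a 32-bit register, replicates its sign bit across the 32 - Width high bits plus the field's own top bit. A short check in plain C++.

#include <cstdint>

// Count how many times the sign bit is replicated at the top of a 32-bit value.
static unsigned numSignBits(int32_t V) {
  unsigned Count = 0;
  int Sign = (V >> 31) & 1;
  for (int I = 31; I >= 0 && ((V >> I) & 1) == Sign; --I)
    ++Count;
  return Count;
}

// An 8-bit field holding -128, extracted with BFE_I32 width 8 at offset 0,
// becomes 0xFFFFFF80: numSignBits reports 25, matching 32 - 8 + 1.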
5953
5954 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5955 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
5956 const MachineRegisterInfo &MRI, unsigned Depth) const {
5957 const MachineInstr *MI = MRI.getVRegDef(R);
5958 if (!MI)
5959 return 1;
5960
5961 // TODO: Check range metadata on MMO.
5962 switch (MI->getOpcode()) {
5963 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5964 return 25;
5965 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5966 return 17;
5967 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5968 return 24;
5969 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5970 return 16;
5971 case AMDGPU::G_AMDGPU_SMED3:
5972 case AMDGPU::G_AMDGPU_UMED3: {
5973 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5974 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5975 if (Tmp2 == 1)
5976 return 1;
5977 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5978 if (Tmp1 == 1)
5979 return 1;
5980 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5981 if (Tmp0 == 1)
5982 return 1;
5983 return std::min({Tmp0, Tmp1, Tmp2});
5984 }
5985 default:
5986 return 1;
5987 }
5988}
5989
5990 bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
5991 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5992 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
5993 unsigned Opcode = Op.getOpcode();
5994 switch (Opcode) {
5995 case AMDGPUISD::BFE_I32:
5996 case AMDGPUISD::BFE_U32:
5997 return false;
5998 }
5999 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
6000 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6001}
6002
6003 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
6004 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6005 unsigned Depth) const {
6006 unsigned Opcode = Op.getOpcode();
6007 switch (Opcode) {
6008 case AMDGPUISD::FMIN_LEGACY:
6009 case AMDGPUISD::FMAX_LEGACY: {
6010 if (SNaN)
6011 return true;
6012
6013 // TODO: Can check no nans on one of the operands for each one, but which
6014 // one?
6015 return false;
6016 }
6017 case AMDGPUISD::FMUL_LEGACY:
6018 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6019 if (SNaN)
6020 return true;
6021 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6022 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6023 }
6024 case AMDGPUISD::FMED3:
6025 case AMDGPUISD::FMIN3:
6026 case AMDGPUISD::FMAX3:
6027 case AMDGPUISD::FMINIMUM3:
6028 case AMDGPUISD::FMAXIMUM3:
6029 case AMDGPUISD::FMAD_FTZ: {
6030 if (SNaN)
6031 return true;
6032 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6033 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6034 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6035 }
6036 case AMDGPUISD::CVT_F32_UBYTE0:
6037 case AMDGPUISD::CVT_F32_UBYTE1:
6038 case AMDGPUISD::CVT_F32_UBYTE2:
6039 case AMDGPUISD::CVT_F32_UBYTE3:
6040 return true;
6041
6042 case AMDGPUISD::RCP:
6043 case AMDGPUISD::RSQ:
6044 case AMDGPUISD::RCP_LEGACY:
6045 case AMDGPUISD::RSQ_CLAMP: {
6046 if (SNaN)
6047 return true;
6048
6049 // TODO: Need is known positive check.
6050 return false;
6051 }
6052 case ISD::FLDEXP:
6053 case AMDGPUISD::FRACT: {
6054 if (SNaN)
6055 return true;
6056 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6057 }
6058 case AMDGPUISD::DIV_SCALE:
6059 case AMDGPUISD::DIV_FMAS:
6060 case AMDGPUISD::DIV_FIXUP:
6061 // TODO: Refine on operands.
6062 return SNaN;
6063 case AMDGPUISD::SIN_HW:
6064 case AMDGPUISD::COS_HW: {
6065 // TODO: Need check for infinity
6066 return SNaN;
6067 }
6068 case ISD::INTRINSIC_WO_CHAIN: {
6069 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6070 // TODO: Handle more intrinsics
6071 switch (IntrinsicID) {
6072 case Intrinsic::amdgcn_cubeid:
6073 case Intrinsic::amdgcn_cvt_off_f32_i4:
6074 return true;
6075
6076 case Intrinsic::amdgcn_frexp_mant: {
6077 if (SNaN)
6078 return true;
6079 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6080 }
6081 case Intrinsic::amdgcn_cvt_pkrtz: {
6082 if (SNaN)
6083 return true;
6084 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6085 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6086 }
6087 case Intrinsic::amdgcn_rcp:
6088 case Intrinsic::amdgcn_rsq:
6089 case Intrinsic::amdgcn_rcp_legacy:
6090 case Intrinsic::amdgcn_rsq_legacy:
6091 case Intrinsic::amdgcn_rsq_clamp:
6092 case Intrinsic::amdgcn_tanh: {
6093 if (SNaN)
6094 return true;
6095
6096 // TODO: Need is known positive check.
6097 return false;
6098 }
6099 case Intrinsic::amdgcn_trig_preop:
6100 case Intrinsic::amdgcn_fdot2:
6101 // TODO: Refine on operand
6102 return SNaN;
6103 case Intrinsic::amdgcn_fma_legacy:
6104 if (SNaN)
6105 return true;
6106 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6107 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6108 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6109 default:
6110 return false;
6111 }
6112 }
6113 default:
6114 return false;
6115 }
6116}
6117
6118 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6119 Register N0, Register N1) const {
6120 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6121}
unsigned const MachineRegisterInfo * MRI
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_READONLY
Definition Compiler.h:322
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static const AMDGPUSubtarget & get(const MachineFunction &MF)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1396
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1163
const fltSemantics & getSemantics() const
Definition APFloat.h:1439
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1181
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1140
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1151
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1389
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
Type * getValueType() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
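A small sketch, assuming DAG, DL, and an i32 SDValue V: keeping only the low 16 bits is equivalent to masking with 0xFFFF.
SDValue LowHalf = DAG.getZeroExtendInReg(V, DL, MVT::i16); // V & 0xFFFF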
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
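A usage sketch, assuming DAG and DL are in scope:
SDValue Ten = DAG.getConstant(10, DL, MVT::i32);            // unsigned payload
SDValue MinusOne = DAG.getSignedConstant(-1, DL, MVT::i64);  // signed payload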
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
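A sketch, assuming DAG, DL, and two signed i32 SDValues A and B: selecting the larger value.
SDValue Max = DAG.getSelectCC(DL, A, B, A, B, ISD::SETGT); // (A > B) ? A : B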
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
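A sketch, assuming DAG and an i32 SDValue Op: a value is representable in 8 signed bits when at least 32 - 8 + 1 = 25 sign bits are known.
bool FitsInI8 = DAG.ComputeNumSignBits(Op) >= 25;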
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
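A sketch, assuming DAG and an i32 SDValue Op, showing two equivalent ways to prove the high 16 bits are zero:
KnownBits Known = DAG.computeKnownBits(Op);
bool HighHalfZero = Known.countMinLeadingZeros() >= 16;
bool HighHalfZero2 = DAG.MaskedValueIsZero(Op, APInt::getHighBitsSet(32, 16));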
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
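A sketch, assuming DAG, DL, and an i64 SDValue V64 are in scope:
auto [Lo, Hi] = DAG.SplitScalar(V64, DL, MVT::i32, MVT::i32); // low and high i32 halves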
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
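A hypothetical sketch of a call a target constructor might make so that a 64-bit divide is first attempted as a 32-bit divide when the operands dynamically fit in 32 bits:
addBypassSlowDiv(64, 32);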
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetOptions Options
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:813
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:782
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:595
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:773
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to the "store atomic" instruction.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:517
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:847
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:513
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:874
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:579
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:412
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition ISDOpcodes.h:997
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:987
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:838
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:781
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:536
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:543
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:790
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to the "load atomic" instruction.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:698
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:759
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:644
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:609
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:571
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:844
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:805
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum/maximum on two values, following the IEEE-754 definition...
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:882
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:721
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:972
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:799
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:920
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:733
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:560
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:953
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:991
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:850
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:529
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that are the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:551
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
MaybeAlign getAlign(const CallInst &I, unsigned Index)
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
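Worked values, as a sketch:
PowerOf2Ceil(24); // 32
PowerOf2Ceil(32); // 32
PowerOf2Ceil(33); // 64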
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most, stopping at the first 1.
Definition bit.h:202
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least, stopping at the first 1.
Definition bit.h:236
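Worked values for a 32-bit unsigned argument, as a sketch:
countr_zero(0x8u); // 3: three trailing zero bits
countl_zero(0x8u); // 28: twenty-eight leading zero bits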
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
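Worked values, as a sketch:
Hi_32(0x0000000180000000ULL); // 0x00000001
Lo_32(0x0000000180000000ULL); // 0x80000000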
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1551
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
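Worked values, as a sketch:
Log2(Align(16));                          // 4
commonAlignment(Align(16), /*Offset=*/8); // Align(8)
commonAlignment(Align(4), /*Offset=*/8);  // Align(4)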
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:477
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:419
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no fewer bits than VT.
Definition ValueTypes.h:292
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:142
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:129
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition KnownBits.h:114
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition KnownBits.h:269
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...