AMDGPUISelLowering.cpp
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
39
40// Find a larger type to do a load / store of a vector with.
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
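// A few illustrative mappings this helper produces (hedged examples, not an
// exhaustive list):
//   v3f32 (96 bits) -> v3i32  (96 is a multiple of 32)
//   f16   (16 bits) -> i16    (StoreSize <= 32)
//   v3i16 (48 bits) -> v3i16  (not a multiple of 32; returned unchanged)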
51
55
57  // In order for this to be a signed 24-bit value, bit 23 must
58  // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
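// Worked example (assuming the standard known-bits semantics): if Op is a
// sign_extend_inreg from i16, ComputeMaxSignificantBits returns at most 16,
// so the result here is <= 24 and the operand is a candidate for the 24-bit
// multiply paths (e.g. AMDGPUISD::MUL_I24 / v_mul_i32_i24).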
61
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66  // Always lower memset, memcpy, and memmove intrinsics to load/store
67  // instructions, rather than generating calls to memset, memcpy, or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
158  // TODO: Would be better to consume these as directly legal.
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
172 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
173
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
176
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
179
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
185 for (MVT VT : MVT::integer_valuetypes())
187 Expand);
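  // Illustrative consequence of the Expand actions above (hedged sketch): a
  // 64-bit extending load such as (sextload i64, i8) is legalized into a
  // 32-bit extending load followed by a sign_extend to i64.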
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(Op, VT, MVT::i1, Promote);
195 setLoadExtAction(Op, VT, MVT::i8, Legal);
196 setLoadExtAction(Op, VT, MVT::i16, Legal);
197 setLoadExtAction(Op, VT, MVT::i32, Expand);
198 }
199 }
200
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
205 Expand);
206
207 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
221
222 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
227 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
228
229 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
241
243 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
244
246 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
247
249 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
250
252 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
253
255 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
256
258 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
259
261 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
262
264 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
265
267 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
268
270 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
271
273 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
274
276 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
277
279 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
280
282 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
283
285 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
286
288 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
289
291 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
292
294 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
295
297 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
298
300 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
301
303 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
304
306 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
307
309 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
310
312 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
313
315 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
316
318 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
319
321 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
322
323 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
325 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
326 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
327
328 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
330 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
331 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
332
333 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
334 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
335 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
336 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
337 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
338 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
339 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
340 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
341 setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
342 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
343 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
344 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
345 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
346 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
347 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
348
349 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
350 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
351 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
352
353 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
354 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
355 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
356
357 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
358
359 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
360 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
361 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
362 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
363 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
364 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
365 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
366
367 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
368 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
369 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
370 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
371 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
372
373 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
374 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
375 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
376
377 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
378 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
379 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
380
381 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
382 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
383 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
384
385 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
386 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
387 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
388
389 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
390 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
391 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
392 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
393 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
395 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
396
397 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
398 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
399
401
402  // For R600, this is totally unsupported; just custom lower it to produce an
403  // error.
405
406 // Library functions. These default to Expand, but we have instructions
407 // for them.
410 {MVT::f16, MVT::f32}, Legal);
412
414 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
416 {MVT::f16, MVT::f32, MVT::f64}, Expand);
417
420 Custom);
421
422 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
423
424 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
425
426 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
427 Expand);
428
429 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
430 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
432
434 Custom);
435
436 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
437
438  // FIXME: These IS_FPCLASS vector fp types are marked custom so they reach the
439  // scalarization code. This can be removed once IS_FPCLASS expansion is no
440  // longer called by default for types not marked custom/legal.
442 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
443 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
444 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
445 MVT::v16f64},
446 Custom);
447
448 // Expand to fneg + fadd.
450
452 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
453 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
454 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
455 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
456 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
457 Custom);
458
461 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
462 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
463 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
464 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
465 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
466 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
467 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
468 Custom);
469
471 Expand);
472 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
473
474 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
475 for (MVT VT : ScalarIntVTs) {
476 // These should use [SU]DIVREM, so set them to expand
478 Expand);
479
480    // The GPU does not have a divrem function for signed or unsigned values.
482
483    // The GPU does not have [S|U]MUL_LOHI as a single instruction.
485
487
488 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
490 }
491
492 // The hardware supports 32-bit FSHR, but not FSHL.
494
495 setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
496
498
503 MVT::i64, Custom);
505
507 Legal);
508
511 MVT::i64, Custom);
512
513 for (auto VT : {MVT::i8, MVT::i16})
515
516 static const MVT::SimpleValueType VectorIntTypes[] = {
517 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
518 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
519
520 for (MVT VT : VectorIntTypes) {
521 // Expand the following operations for the current type by default.
522 // clang-format off
542 VT, Expand);
543 // clang-format on
544 }
545
546 static const MVT::SimpleValueType FloatVectorTypes[] = {
547 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
548 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
549
550 for (MVT VT : FloatVectorTypes) {
563 VT, Expand);
564 }
565
566  // This causes an unrolled select operation to be used rather than expansion
567  // with bit operations. This is in general better, but the alternative of using
568  // BFI instructions may be better if the select sources are SGPRs.
570 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
571
573 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
574
576 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
577
579 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
580
582 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
583
585 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
586
588 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
589
591 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
592
594 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
595
597 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
598
600 setJumpIsExpensive(true);
601
604
606
607  // We want to find all load dependencies for long chains of stores to enable
608  // merging into very wide vectors. The problem is with vectors with > 4
609  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
610  // vectors are legal types, even though we usually have to split the loads.
611  // When we can more precisely specify load legality per address space, we
612  // should be able to make FindBetterChain/MergeConsecutiveStores smarter so
613  // that they can figure out what to do in 2 iterations without all
614  // N > 4 stores on the same chain.
616
617 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
618 // about these during lowering.
619 MaxStoresPerMemcpy = 0xffffffff;
620 MaxStoresPerMemmove = 0xffffffff;
621 MaxStoresPerMemset = 0xffffffff;
622
623 // The expansion for 64-bit division is enormous.
625 addBypassSlowDiv(64, 32);
626
637
641}
642
644 const auto Flags = Op.getNode()->getFlags();
645 if (Flags.hasNoSignedZeros())
646 return true;
647
648 return false;
649}
650
651//===----------------------------------------------------------------------===//
652// Target Information
653//===----------------------------------------------------------------------===//
654
656static bool fnegFoldsIntoOpcode(unsigned Opc) {
657 switch (Opc) {
658 case ISD::FADD:
659 case ISD::FSUB:
660 case ISD::FMUL:
661 case ISD::FMA:
662 case ISD::FMAD:
663 case ISD::FMINNUM:
664 case ISD::FMAXNUM:
667 case ISD::FMINIMUM:
668 case ISD::FMAXIMUM:
669 case ISD::FMINIMUMNUM:
670 case ISD::FMAXIMUMNUM:
671 case ISD::SELECT:
672 case ISD::FSIN:
673 case ISD::FTRUNC:
674 case ISD::FRINT:
675 case ISD::FNEARBYINT:
676 case ISD::FROUNDEVEN:
678 case AMDGPUISD::RCP:
679 case AMDGPUISD::RCP_LEGACY:
680 case AMDGPUISD::RCP_IFLAG:
681 case AMDGPUISD::SIN_HW:
682 case AMDGPUISD::FMUL_LEGACY:
683 case AMDGPUISD::FMIN_LEGACY:
684 case AMDGPUISD::FMAX_LEGACY:
685 case AMDGPUISD::FMED3:
686 // TODO: handle llvm.amdgcn.fma.legacy
687 return true;
688 case ISD::BITCAST:
689 llvm_unreachable("bitcast is special cased");
690 default:
691 return false;
692 }
693}
694
695static bool fnegFoldsIntoOp(const SDNode *N) {
696 unsigned Opc = N->getOpcode();
697 if (Opc == ISD::BITCAST) {
698 // TODO: Is there a benefit to checking the conditions performFNegCombine
699 // does? We don't for the other cases.
700 SDValue BCSrc = N->getOperand(0);
701 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
702 return BCSrc.getNumOperands() == 2 &&
703 BCSrc.getOperand(1).getValueSizeInBits() == 32;
704 }
705
706 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
707 }
708
709 return fnegFoldsIntoOpcode(Opc);
710}
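// Hedged example of the BITCAST special case above: conceptually,
//   (fneg (f64 (bitcast (build_vector i32:Lo, i32:Hi))))
// can be folded by flipping only the sign bit of the high half, i.e. rebuilding
// the vector with (xor Hi, 0x80000000) before the bitcast, so no FP negate is
// emitted at all.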
711
712/// \returns true if the operation will definitely need to use a 64-bit
713/// encoding, and thus will use a VOP3 encoding regardless of the source
714/// modifiers.
716static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
717 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
718 VT == MVT::f64;
719}
720
721/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers on the
722/// operand type for ISD::SELECT.
724static bool selectSupportsSourceMods(const SDNode *N) {
725 // TODO: Only applies if select will be vector
726 return N->getValueType(0) == MVT::f32;
727}
728
729// Most FP instructions support source modifiers, but this could be refined
730// slightly.
732static bool hasSourceMods(const SDNode *N) {
733 if (isa<MemSDNode>(N))
734 return false;
735
736 switch (N->getOpcode()) {
737 case ISD::CopyToReg:
738 case ISD::FDIV:
739 case ISD::FREM:
740 case ISD::INLINEASM:
742 case AMDGPUISD::DIV_SCALE:
744
745 // TODO: Should really be looking at the users of the bitcast. These are
746 // problematic because bitcasts are used to legalize all stores to integer
747 // types.
748 case ISD::BITCAST:
749 return false;
751 switch (N->getConstantOperandVal(0)) {
752 case Intrinsic::amdgcn_interp_p1:
753 case Intrinsic::amdgcn_interp_p2:
754 case Intrinsic::amdgcn_interp_mov:
755 case Intrinsic::amdgcn_interp_p1_f16:
756 case Intrinsic::amdgcn_interp_p2_f16:
757 return false;
758 default:
759 return true;
760 }
761 }
762 case ISD::SELECT:
764 default:
765 return true;
766 }
767}
768
770 unsigned CostThreshold) {
771  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
772  // it is truly free to use a source modifier in all cases. If there are
773  // multiple users, but using a source modifier for each one would necessitate a
774  // VOP3 encoding, there will be a code size increase. Try to avoid increasing
775  // code size unless we know it will save on the instruction count.
776 unsigned NumMayIncreaseSize = 0;
777 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
778
779 assert(!N->use_empty());
780
781 // XXX - Should this limit number of uses to check?
782 for (const SDNode *U : N->users()) {
783 if (!hasSourceMods(U))
784 return false;
785
786 if (!opMustUseVOP3Encoding(U, VT)) {
787 if (++NumMayIncreaseSize > CostThreshold)
788 return false;
789 }
790 }
791
792 return true;
793}
794
796 ISD::NodeType ExtendKind) const {
797 assert(!VT.isVector() && "only scalar expected");
798
799 // Round to the next multiple of 32-bits.
800 unsigned Size = VT.getSizeInBits();
801 if (Size <= 32)
802 return MVT::i32;
803 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
804}
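// Worked examples of the rounding above: an i48 return value becomes i64
// (32 * ((48 + 31) / 32) == 64), while i8 and i16 become i32.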
805
807 return 32;
808}
809
811 return true;
812}
813
814// The backend supports 32 and 64 bit floating point immediates.
815// FIXME: Why are we reporting vectors of FP immediates as legal?
817 bool ForCodeSize) const {
818 return isTypeLegal(VT.getScalarType());
819}
820
821// We don't want to shrink f64 / f32 constants.
823 EVT ScalarVT = VT.getScalarType();
824 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
825}
826
828 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
829 std::optional<unsigned> ByteOffset) const {
830 // TODO: This may be worth removing. Check regression tests for diffs.
831 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
832 return false;
833
834 unsigned NewSize = NewVT.getStoreSizeInBits();
835
836 // If we are reducing to a 32-bit load or a smaller multi-dword load,
837 // this is always better.
838 if (NewSize >= 32)
839 return true;
840
841 EVT OldVT = N->getValueType(0);
842 unsigned OldSize = OldVT.getStoreSizeInBits();
843
845 unsigned AS = MN->getAddressSpace();
846 // Do not shrink an aligned scalar load to sub-dword.
847 // Scalar engine cannot do sub-dword loads.
848 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
849 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
853 MN->isInvariant())) &&
855 return false;
856
857 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
858 // extloads, so doing one requires using a buffer_load. In cases where we
859 // still couldn't use a scalar load, using the wider load shouldn't really
860 // hurt anything.
861
862  // If the old load already had to be an extload, there's no harm in continuing
863  // to reduce the width.
864 return (OldSize < 32);
865}
866
868 const SelectionDAG &DAG,
869 const MachineMemOperand &MMO) const {
870
871 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
872
873 if (LoadTy.getScalarType() == MVT::i32)
874 return false;
875
876 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
877 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
878
879 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
880 return false;
881
882 unsigned Fast = 0;
884 CastTy, MMO, &Fast) &&
885 Fast;
886}
887
888// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
889// profitable with the expansion for 64-bit since it's generally good to
890// speculate things.
892 return true;
893}
894
896 return true;
897}
898
900 switch (N->getOpcode()) {
901 case ISD::EntryToken:
902 case ISD::TokenFactor:
903 return true;
905 unsigned IntrID = N->getConstantOperandVal(0);
907 }
909 unsigned IntrID = N->getConstantOperandVal(1);
911 }
912 case ISD::LOAD:
913 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
915 return true;
916 return false;
917 case AMDGPUISD::SETCC: // ballot-style instruction
918 return true;
919 }
920 return false;
921}
922
924 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
925 NegatibleCost &Cost, unsigned Depth) const {
926
927 switch (Op.getOpcode()) {
928 case ISD::FMA:
929 case ISD::FMAD: {
930 // Negating a fma is not free if it has users without source mods.
931 if (!allUsesHaveSourceMods(Op.getNode()))
932 return SDValue();
933 break;
934 }
935 case AMDGPUISD::RCP: {
936 SDValue Src = Op.getOperand(0);
937 EVT VT = Op.getValueType();
938 SDLoc SL(Op);
939
940 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
941 ForCodeSize, Cost, Depth + 1);
942 if (NegSrc)
943 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
944 return SDValue();
945 }
946 default:
947 break;
948 }
949
950 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
951 ForCodeSize, Cost, Depth);
952}
953
954//===---------------------------------------------------------------------===//
955// Target Properties
956//===---------------------------------------------------------------------===//
957
960
961 // Packed operations do not have a fabs modifier.
962 // Report this based on the end legalized type.
963 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
964}
965
968 // Report this based on the end legalized type.
969 VT = VT.getScalarType();
970 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
971}
972
974 unsigned NumElem,
975 unsigned AS) const {
976 return true;
977}
978
980 // There are few operations which truly have vector input operands. Any vector
981 // operation is going to involve operations on each component, and a
982 // build_vector will be a copy per element, so it always makes sense to use a
983 // build_vector input in place of the extracted element to avoid a copy into a
984 // super register.
985 //
986 // We should probably only do this if all users are extracts only, but this
987 // should be the common case.
988 return true;
989}
990
992 // Truncate is just accessing a subregister.
993
994 unsigned SrcSize = Source.getSizeInBits();
995 unsigned DestSize = Dest.getSizeInBits();
996
997  return DestSize < SrcSize && DestSize % 32 == 0;
998}
999
1001 // Truncate is just accessing a subregister.
1002
1003 unsigned SrcSize = Source->getScalarSizeInBits();
1004 unsigned DestSize = Dest->getScalarSizeInBits();
1005
1006  if (DestSize == 16 && Subtarget->has16BitInsts())
1007 return SrcSize >= 32;
1008
1009 return DestSize < SrcSize && DestSize % 32 == 0;
1010}
1011
1013 unsigned SrcSize = Src->getScalarSizeInBits();
1014 unsigned DestSize = Dest->getScalarSizeInBits();
1015
1016 if (SrcSize == 16 && Subtarget->has16BitInsts())
1017 return DestSize >= 32;
1018
1019 return SrcSize == 32 && DestSize == 64;
1020}
1021
1023  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1024  // practical purposes, the extra mov 0 to load a 64-bit value is free. As used,
1025  // this will enable reducing 64-bit operations to 32-bit, which is always
1026  // good.
1027
1028 if (Src == MVT::i16)
1029  return Dest == MVT::i32 || Dest == MVT::i64;
1030
1031 return Src == MVT::i32 && Dest == MVT::i64;
1032}
1033
1035 EVT DestVT) const {
1036 switch (N->getOpcode()) {
1037 case ISD::ADD:
1038 case ISD::SUB:
1039 case ISD::SHL:
1040 case ISD::SRL:
1041 case ISD::SRA:
1042 case ISD::AND:
1043 case ISD::OR:
1044 case ISD::XOR:
1045 case ISD::MUL:
1046 case ISD::SETCC:
1047 case ISD::SELECT:
1048 case ISD::SMIN:
1049 case ISD::SMAX:
1050 case ISD::UMIN:
1051 case ISD::UMAX:
1052 if (isTypeLegal(MVT::i16) &&
1053 (!DestVT.isVector() ||
1054 !isOperationLegal(ISD::ADD, MVT::v2i16))) { // Check if VOP3P
1055 // Don't narrow back down to i16 if promoted to i32 already.
1056 if (!N->isDivergent() && DestVT.isInteger() &&
1057 DestVT.getScalarSizeInBits() > 1 &&
1058 DestVT.getScalarSizeInBits() <= 16 &&
1059 SrcVT.getScalarSizeInBits() > 16) {
1060 return false;
1061 }
1062 }
1063 return true;
1064 default:
1065 break;
1066 }
1067
1068 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1069 // limited number of native 64-bit operations. Shrinking an operation to fit
1070 // in a single 32-bit register should always be helpful. As currently used,
1071 // this is much less general than the name suggests, and is only used in
1072 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1073 // not profitable, and may actually be harmful.
1074 if (isa<LoadSDNode>(N))
1075 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1076
1077 return true;
1078}
1079
1081 const SDNode* N, CombineLevel Level) const {
1082 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1083 N->getOpcode() == ISD::SRL) &&
1084 "Expected shift op");
1085
1086 SDValue ShiftLHS = N->getOperand(0);
1087 if (!ShiftLHS->hasOneUse())
1088 return false;
1089
1090 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1091 !ShiftLHS.getOperand(0)->hasOneUse())
1092 return false;
1093
1094  // Always commute pre-type legalization, and always commute right shifts.
1095 // We're looking for shl(or(x,y),z) patterns.
1097 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1098 return true;
1099
1100  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1101 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1102 (N->user_begin()->getOpcode() == ISD::SRA ||
1103 N->user_begin()->getOpcode() == ISD::SRL))
1104 return false;
1105
1106 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1107 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1108 if (LHS.getOpcode() != ISD::SHL)
1109 return false;
1110 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1111 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1112 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1113 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1114 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1115 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1116 };
1117 SDValue LHS = N->getOperand(0).getOperand(0);
1118 SDValue RHS = N->getOperand(0).getOperand(1);
1119 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1120}
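// Hedged sketch of the load-combine shape the check above tries to preserve,
// assuming two adjacent zero-extending i8 loads being merged into one i16:
//   (or (shl (zextload i8 [p+1]), 8), (zextload i8 [p]))
// Commuting the outer shift into this OR would keep the DAG combiner from
// recognizing the inner OR as a single wider load.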
1121
1122//===---------------------------------------------------------------------===//
1123// TargetLowering Callbacks
1124//===---------------------------------------------------------------------===//
1125
1127 bool IsVarArg) {
1128 switch (CC) {
1136 return CC_AMDGPU;
1139 return CC_AMDGPU_CS_CHAIN;
1140 case CallingConv::C:
1141 case CallingConv::Fast:
1142 case CallingConv::Cold:
1143 return CC_AMDGPU_Func;
1146 return CC_SI_Gfx;
1149 default:
1150 reportFatalUsageError("unsupported calling convention for call");
1151 }
1152}
1153
1155 bool IsVarArg) {
1156 switch (CC) {
1159 llvm_unreachable("kernels should not be handled here");
1169 return RetCC_SI_Shader;
1172 return RetCC_SI_Gfx;
1173 case CallingConv::C:
1174 case CallingConv::Fast:
1175 case CallingConv::Cold:
1176 return RetCC_AMDGPU_Func;
1177 default:
1178 reportFatalUsageError("unsupported calling convention");
1179 }
1180}
1181
1182/// The SelectionDAGBuilder will automatically promote function arguments
1183/// with illegal types. However, this does not work for the AMDGPU targets
1184/// since the function arguments are stored in memory as these illegal types.
1185/// In order to handle this properly we need to get the original type sizes
1186/// from the LLVM IR Function and fix up the ISD::InputArg values before
1187/// passing them to AnalyzeFormalArguments().
1188
1189/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1190/// input values across multiple registers. Each item in the Ins array
1191/// represents a single value that will be stored in registers. Ins[x].VT is
1192/// the value type of the value that will be stored in the register, so
1193/// whatever SDNode we lower the argument to needs to be this type.
1194///
1195/// In order to correctly lower the arguments we need to know the size of each
1196/// argument. Since Ins[x].VT gives us the size of the register that will
1197/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1198/// for the original function argument so that we can deduce the correct memory
1199/// type to use for Ins[x]. In most cases the correct memory type will be
1200/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1201/// we have a kernel argument of type v8i8, this argument will be split into
1202/// 8 parts and each part will be represented by its own item in the Ins array.
1203/// For each part, Ins[x].ArgVT will be v8i8, which is the full type of
1204/// the argument before it was split. From this, we deduce that the memory type
1205/// for each individual part is i8. We pass the memory type as LocVT to the
1206/// calling convention analysis function and the register type (Ins[x].VT) as
1207/// the ValVT.
1209 CCState &State,
1210 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1211 const MachineFunction &MF = State.getMachineFunction();
1212 const Function &Fn = MF.getFunction();
1213 LLVMContext &Ctx = Fn.getContext();
1214 const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
1216
1217 Align MaxAlign = Align(1);
1218 uint64_t ExplicitArgOffset = 0;
1219 const DataLayout &DL = Fn.getDataLayout();
1220
1221 unsigned InIndex = 0;
1222
1223 for (const Argument &Arg : Fn.args()) {
1224 const bool IsByRef = Arg.hasByRefAttr();
1225 Type *BaseArgTy = Arg.getType();
1226 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1227 Align Alignment = DL.getValueOrABITypeAlignment(
1228 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1229 MaxAlign = std::max(Alignment, MaxAlign);
1230 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1231
1232 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1233 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1234
1235 // We're basically throwing away everything passed into us and starting over
1236 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1237 // to us as computed in Ins.
1238 //
1239 // We also need to figure out what type legalization is trying to do to get
1240 // the correct memory offsets.
1241
1242 SmallVector<EVT, 16> ValueVTs;
1244 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1245 &Offsets, ArgOffset);
1246
1247 for (unsigned Value = 0, NumValues = ValueVTs.size();
1248 Value != NumValues; ++Value) {
1249 uint64_t BasePartOffset = Offsets[Value];
1250
1251 EVT ArgVT = ValueVTs[Value];
1252 EVT MemVT = ArgVT;
1253 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1254 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1255
1256 if (NumRegs == 1) {
1257 // This argument is not split, so the IR type is the memory type.
1258 if (ArgVT.isExtended()) {
1259 // We have an extended type, like i24, so we should just use the
1260 // register type.
1261 MemVT = RegisterVT;
1262 } else {
1263 MemVT = ArgVT;
1264 }
1265 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1266 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1267 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1268 // We have a vector value which has been split into a vector with
1269 // the same scalar type, but fewer elements. This should handle
1270 // all the floating-point vector types.
1271 MemVT = RegisterVT;
1272 } else if (ArgVT.isVector() &&
1273 ArgVT.getVectorNumElements() == NumRegs) {
1274 // This arg has been split so that each element is stored in a separate
1275 // register.
1276 MemVT = ArgVT.getScalarType();
1277 } else if (ArgVT.isExtended()) {
1278 // We have an extended type, like i65.
1279 MemVT = RegisterVT;
1280 } else {
1281 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1282 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1283 if (RegisterVT.isInteger()) {
1284 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1285 } else if (RegisterVT.isVector()) {
1286 assert(!RegisterVT.getScalarType().isFloatingPoint());
1287 unsigned NumElements = RegisterVT.getVectorNumElements();
1288 assert(MemoryBits % NumElements == 0);
1289          // This vector type has been split into another vector type with
1290          // a different element size.
1291 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1292 MemoryBits / NumElements);
1293 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1294 } else {
1295 llvm_unreachable("cannot deduce memory type.");
1296 }
1297 }
1298
1299 // Convert one element vectors to scalar.
1300 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1301 MemVT = MemVT.getScalarType();
1302
1303 // Round up vec3/vec5 argument.
1304 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1305 MemVT = MemVT.getPow2VectorType(State.getContext());
1306 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1307 MemVT = MemVT.getRoundIntegerType(State.getContext());
1308 }
1309
1310 unsigned PartOffset = 0;
1311 for (unsigned i = 0; i != NumRegs; ++i) {
1312 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1313 BasePartOffset + PartOffset,
1314 MemVT.getSimpleVT(),
1316 PartOffset += MemVT.getStoreSize();
1317 }
1318 }
1319 }
1320}
1321
1323 SDValue Chain, CallingConv::ID CallConv,
1324 bool isVarArg,
1326 const SmallVectorImpl<SDValue> &OutVals,
1327 const SDLoc &DL, SelectionDAG &DAG) const {
1328 // FIXME: Fails for r600 tests
1329 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1330 // "wave terminate should not have return values");
1331 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1332}
1333
1334//===---------------------------------------------------------------------===//
1335// Target specific lowering
1336//===---------------------------------------------------------------------===//
1337
1338/// Selects the correct CCAssignFn for a given CallingConvention value.
1343
1348
1350 SelectionDAG &DAG,
1351 MachineFrameInfo &MFI,
1352 int ClobberedFI) const {
1353 SmallVector<SDValue, 8> ArgChains;
1354 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1355 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1356
1357  // Include the original chain at the beginning of the list. When this is
1358  // used by target LowerCall hooks, it helps the legalizer find the
1359  // CALLSEQ_BEGIN node.
1360 ArgChains.push_back(Chain);
1361
1362  // Add a chain value for each stack argument load that overlaps the clobbered object.
1363 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1364 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1365 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1366 if (FI->getIndex() < 0) {
1367 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1368 int64_t InLastByte = InFirstByte;
1369 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1370
1371 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1372 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1373 ArgChains.push_back(SDValue(L, 1));
1374 }
1375 }
1376 }
1377 }
1378
1379 // Build a tokenfactor for all the chains.
1380 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1381}
1382
1385 StringRef Reason) const {
1386 SDValue Callee = CLI.Callee;
1387 SelectionDAG &DAG = CLI.DAG;
1388
1389 const Function &Fn = DAG.getMachineFunction().getFunction();
1390
1391 StringRef FuncName("<unknown>");
1392
1394 FuncName = G->getSymbol();
1395 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1396 FuncName = G->getGlobal()->getName();
1397
1398 DAG.getContext()->diagnose(
1399 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1400
1401 if (!CLI.IsTailCall) {
1402 for (ISD::InputArg &Arg : CLI.Ins)
1403 InVals.push_back(DAG.getPOISON(Arg.VT));
1404 }
1405
1406 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1407 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1408 return CLI.Chain;
1409
1410 SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
1411 return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
1412}
1413
1415 SmallVectorImpl<SDValue> &InVals) const {
1416 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1417}
1418
1420 SelectionDAG &DAG) const {
1421 const Function &Fn = DAG.getMachineFunction().getFunction();
1422
1424 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1425 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1426 return DAG.getMergeValues(Ops, SDLoc());
1427}
1428
1430 SelectionDAG &DAG) const {
1431 switch (Op.getOpcode()) {
1432 default:
1433 Op->print(errs(), &DAG);
1434 llvm_unreachable("Custom lowering code for this "
1435 "instruction is not implemented yet!");
1436 break;
1438 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1440 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1441 case ISD::SDIVREM:
1442 return LowerSDIVREM(Op, DAG);
1443 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1444 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1445 case ISD::FRINT: return LowerFRINT(Op, DAG);
1446 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1447 case ISD::FROUNDEVEN:
1448 return LowerFROUNDEVEN(Op, DAG);
1449 case ISD::FROUND: return LowerFROUND(Op, DAG);
1450 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1451 case ISD::FLOG2:
1452 return LowerFLOG2(Op, DAG);
1453 case ISD::FLOG:
1454 case ISD::FLOG10:
1455 return LowerFLOGCommon(Op, DAG);
1456 case ISD::FEXP:
1457 case ISD::FEXP10:
1458 return lowerFEXP(Op, DAG);
1459 case ISD::FEXP2:
1460 return lowerFEXP2(Op, DAG);
1461 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1462 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1463 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1464 case ISD::FP_TO_SINT:
1465 case ISD::FP_TO_UINT:
1466 return LowerFP_TO_INT(Op, DAG);
1469 return LowerFP_TO_INT_SAT(Op, DAG);
1470 case ISD::CTTZ:
1472 case ISD::CTLZ:
1474 return LowerCTLZ_CTTZ(Op, DAG);
1476 }
1477 return Op;
1478}
1479
1482 SelectionDAG &DAG) const {
1483 switch (N->getOpcode()) {
1485 // Different parts of legalization seem to interpret which type of
1486 // sign_extend_inreg is the one to check for custom lowering. The extended
1487 // from type is what really matters, but some places check for custom
1488 // lowering of the result type. This results in trying to use
1489 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1490 // nothing here and let the illegal result integer be handled normally.
1491 return;
1492 case ISD::FLOG2:
1493 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1494 Results.push_back(Lowered);
1495 return;
1496 case ISD::FLOG:
1497 case ISD::FLOG10:
1498 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1499 Results.push_back(Lowered);
1500 return;
1501 case ISD::FEXP2:
1502 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1503 Results.push_back(Lowered);
1504 return;
1505 case ISD::FEXP:
1506 case ISD::FEXP10:
1507 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1508 Results.push_back(Lowered);
1509 return;
1510 case ISD::CTLZ:
1512 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1513 Results.push_back(Lowered);
1514 return;
1515 default:
1516 return;
1517 }
1518}
1519
1521 SDValue Op,
1522 SelectionDAG &DAG) const {
1523
1524 const DataLayout &DL = DAG.getDataLayout();
1526 const GlobalValue *GV = G->getGlobal();
1527
1528 if (!MFI->isModuleEntryFunction()) {
1529 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1530 if (std::optional<uint32_t> Address =
1532 if (IsNamedBarrier) {
1533 unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
1534 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1535 }
1536 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1537 } else if (IsNamedBarrier) {
1538 llvm_unreachable("named barrier should have an assigned address");
1539 }
1540 }
1541
1542 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1543 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1544 if (!MFI->isModuleEntryFunction() &&
1545 GV->getName() != "llvm.amdgcn.module.lds" &&
1547 SDLoc DL(Op);
1548 const Function &Fn = DAG.getMachineFunction().getFunction();
1550 Fn, "local memory global used by non-kernel function",
1551 DL.getDebugLoc(), DS_Warning));
1552
1553 // We currently don't have a way to correctly allocate LDS objects that
1554 // aren't directly associated with a kernel. We do force inlining of
1555 // functions that use local objects. However, if these dead functions are
1556 // not eliminated, we don't want a compile time error. Just emit a warning
1557 // and a trap, since there should be no callable path here.
1558 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1559 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1560 Trap, DAG.getRoot());
1561 DAG.setRoot(OutputChain);
1562 return DAG.getPOISON(Op.getValueType());
1563 }
1564
1565 // XXX: What does the value of G->getOffset() mean?
1566 assert(G->getOffset() == 0 &&
1567 "Do not know what to do with an non-zero offset");
1568
1569 // TODO: We could emit code to handle the initialization somewhere.
1570 // We ignore the initializer for now and legalize it to allow selection.
1571    // The initializer will be rejected during assembly emission anyway.
1572 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1573 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1574 }
1575 return SDValue();
1576}
1577
1579 SelectionDAG &DAG) const {
1581 SDLoc SL(Op);
1582
1583 EVT VT = Op.getValueType();
1584 if (VT.getVectorElementType().getSizeInBits() < 32) {
1585 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1586 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1587 unsigned NewNumElt = OpBitSize / 32;
1588 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1590 MVT::i32, NewNumElt);
1591 for (const SDUse &U : Op->ops()) {
1592 SDValue In = U.get();
1593 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1594 if (NewNumElt > 1)
1595 DAG.ExtractVectorElements(NewIn, Args);
1596 else
1597 Args.push_back(NewIn);
1598 }
1599
1600 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1601 NewNumElt * Op.getNumOperands());
1602 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1603 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1604 }
1605 }
1606
1607 for (const SDUse &U : Op->ops())
1608 DAG.ExtractVectorElements(U.get(), Args);
1609
1610 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1611}
1612
1614 SelectionDAG &DAG) const {
1615 SDLoc SL(Op);
1617 unsigned Start = Op.getConstantOperandVal(1);
1618 EVT VT = Op.getValueType();
1619 EVT SrcVT = Op.getOperand(0).getValueType();
1620
1621 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1622 unsigned NumElt = VT.getVectorNumElements();
1623 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1624 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1625
1626 // Extract 32-bit registers at a time.
1627 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1628 EVT NewVT = NumElt == 2
1629 ? MVT::i32
1630 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1631 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1632
1633 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1634 if (NumElt == 2)
1635 Tmp = Args[0];
1636 else
1637 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1638
1639 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1640 }
1641
1642 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1644
1645 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1646}
1647
1648// TODO: Handle fabs too
1650 if (Val.getOpcode() == ISD::FNEG)
1651 return Val.getOperand(0);
1652
1653 return Val;
1654}
1655
1657 if (Val.getOpcode() == ISD::FNEG)
1658 Val = Val.getOperand(0);
1659 if (Val.getOpcode() == ISD::FABS)
1660 Val = Val.getOperand(0);
1661 if (Val.getOpcode() == ISD::FCOPYSIGN)
1662 Val = Val.getOperand(0);
1663 return Val;
1664}
1665
1667 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1668 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1669 SelectionDAG &DAG = DCI.DAG;
1670 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1671 switch (CCOpcode) {
1672 case ISD::SETOEQ:
1673 case ISD::SETONE:
1674 case ISD::SETUNE:
1675 case ISD::SETNE:
1676 case ISD::SETUEQ:
1677 case ISD::SETEQ:
1678 case ISD::SETFALSE:
1679 case ISD::SETFALSE2:
1680 case ISD::SETTRUE:
1681 case ISD::SETTRUE2:
1682 case ISD::SETUO:
1683 case ISD::SETO:
1684 break;
1685 case ISD::SETULE:
1686 case ISD::SETULT: {
1687 if (LHS == True)
1688 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1689 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1690 }
1691 case ISD::SETOLE:
1692 case ISD::SETOLT:
1693 case ISD::SETLE:
1694 case ISD::SETLT: {
1695 // Ordered. Assume ordered for undefined.
1696
1697 // Only do this after legalization to avoid interfering with other combines
1698 // which might occur.
1700 !DCI.isCalledByLegalizer())
1701 return SDValue();
1702
1703 // We need to permute the operands to get the correct NaN behavior. The
1704 // selected operand is the second one based on the failing compare with NaN,
1705 // so permute it based on the compare type the hardware uses.
1706 if (LHS == True)
1707 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1708 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1709 }
1710 case ISD::SETUGE:
1711 case ISD::SETUGT: {
1712 if (LHS == True)
1713 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1714 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1715 }
1716 case ISD::SETGT:
1717 case ISD::SETGE:
1718 case ISD::SETOGE:
1719 case ISD::SETOGT: {
1721 !DCI.isCalledByLegalizer())
1722 return SDValue();
1723
1724 if (LHS == True)
1725 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1726 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1727 }
1728 case ISD::SETCC_INVALID:
1729 llvm_unreachable("Invalid setcc condcode!");
1730 }
1731 return SDValue();
1732}
1733
1734/// Generate Min/Max node
1736 SDValue LHS, SDValue RHS,
1737 SDValue True, SDValue False,
1738 SDValue CC,
1739 DAGCombinerInfo &DCI) const {
1740 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1741 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1742
1743 SelectionDAG &DAG = DCI.DAG;
1744
1745 // If we can't directly match this, try to see if we can fold an fneg to
1746 // match.
1747
1750 SDValue NegTrue = peekFNeg(True);
1751
1752 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1753 // fmin/fmax.
1754 //
1755 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1756 // -> fneg (fmin_legacy lhs, K)
1757 //
1758 // TODO: Use getNegatedExpression
1759 if (LHS == NegTrue && CFalse && CRHS) {
1760 APFloat NegRHS = neg(CRHS->getValueAPF());
1761 if (NegRHS == CFalse->getValueAPF()) {
1762 SDValue Combined =
1763 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1764 if (Combined)
1765 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1766 return SDValue();
1767 }
1768 }
1769
1770 return SDValue();
1771}
1772
1773std::pair<SDValue, SDValue>
1775 SDLoc SL(Op);
1776
1777 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1778
1779 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1780 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1781
1782 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1783 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1784
1785 return std::pair(Lo, Hi);
1786}
1787
1789 SDLoc SL(Op);
1790
1791 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1792 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1793 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1794}
1795
1797 SDLoc SL(Op);
1798
1799 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1800 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1801 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1802}
1803
1804// Split a vector type into two parts. The first part is a power-of-two vector.
1805// The second part is whatever is left over, and is a scalar if it would
1806// otherwise be a 1-vector.
1807std::pair<EVT, EVT>
1809 EVT LoVT, HiVT;
1810 EVT EltVT = VT.getVectorElementType();
1811 unsigned NumElts = VT.getVectorNumElements();
1812 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1813 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1814 HiVT = NumElts - LoNumElts == 1
1815 ? EltVT
1816 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1817 return std::pair(LoVT, HiVT);
1818}
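// Illustrative splits produced by the rule above (Lo is a power-of-two vector;
// Hi is the remainder and is scalarized when it would be a 1-vector):
//   v3f32 -> (v2f32, f32)    v5i32 -> (v4i32, i32)
//   v6f32 -> (v4f32, v2f32)  v8i32 -> (v4i32, v4i32)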
1819
1820// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1821// scalar.
1822std::pair<SDValue, SDValue>
1824 const EVT &LoVT, const EVT &HiVT,
1825 SelectionDAG &DAG) const {
1826 EVT VT = N.getValueType();
1828 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1829 VT.getVectorNumElements() &&
1830 "More vector elements requested than available!");
1832 DAG.getVectorIdxConstant(0, DL));
1833
1834 unsigned LoNumElts = LoVT.getVectorNumElements();
1835
1836 if (HiVT.isVector()) {
1837 unsigned HiNumElts = HiVT.getVectorNumElements();
1838 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1839      // Avoid creating an extract_subvector with an index that isn't a multiple
1840      // of the result type's element count.
1842 DAG.getConstant(LoNumElts, DL, MVT::i32));
1843 return {Lo, Hi};
1844 }
1845
1847 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1848 /*Count=*/HiNumElts);
1849 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1850 return {Lo, Hi};
1851 }
1852
1854 DAG.getVectorIdxConstant(LoNumElts, DL));
1855 return {Lo, Hi};
1856}
1857
1859 SelectionDAG &DAG) const {
1861 EVT VT = Op.getValueType();
1862 SDLoc SL(Op);
1863
1864
1865 // If this is a 2 element vector, we really want to scalarize and not create
1866 // weird 1 element vectors.
1867 if (VT.getVectorNumElements() == 2) {
1868 SDValue Ops[2];
1869 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1870 return DAG.getMergeValues(Ops, SL);
1871 }
1872
1873 SDValue BasePtr = Load->getBasePtr();
1874 EVT MemVT = Load->getMemoryVT();
1875
1876 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1877
1878 EVT LoVT, HiVT;
1879 EVT LoMemVT, HiMemVT;
1880 SDValue Lo, Hi;
1881
1882 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1883 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1884 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1885
1886 unsigned Size = LoMemVT.getStoreSize();
1887 Align BaseAlign = Load->getAlign();
1888 Align HiAlign = commonAlignment(BaseAlign, Size);
1889
1890 SDValue LoLoad = DAG.getExtLoad(
1891 Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
1892 LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
1893 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1894 SDValue HiLoad = DAG.getExtLoad(
1895 Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
1896 SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
1897 Load->getMemOperand()->getFlags(), Load->getAAInfo());
1898
1899 SDValue Join;
1900 if (LoVT == HiVT) {
1901 // This is the case where the vector length is a power of two, so it was split evenly.
1902 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1903 } else {
1904 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1905 DAG.getVectorIdxConstant(0, SL));
1906 Join = DAG.getNode(
1907 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1908 VT, Join, HiLoad,
1909 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1910 }
1911
1912 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1913 LoLoad.getValue(1), HiLoad.getValue(1))};
1914
1915 return DAG.getMergeValues(Ops, SL);
1916}
1917
1918SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1919 SelectionDAG &DAG) const {
1920 LoadSDNode *Load = cast<LoadSDNode>(Op);
1921 EVT VT = Op.getValueType();
1922 SDValue BasePtr = Load->getBasePtr();
1923 EVT MemVT = Load->getMemoryVT();
1924 SDLoc SL(Op);
1925 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1926 Align BaseAlign = Load->getAlign();
1927 unsigned NumElements = MemVT.getVectorNumElements();
1928
1929 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1930 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1931 if (NumElements != 3 ||
1932 (BaseAlign < Align(8) &&
1933 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1934 return SplitVectorLoad(Op, DAG);
1935
1936 assert(NumElements == 3);
1937
1938 EVT WideVT =
1939 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1940 EVT WideMemVT =
1941 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1942 SDValue WideLoad = DAG.getExtLoad(
1943 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1944 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1945 return DAG.getMergeValues(
1946 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1947 DAG.getVectorIdxConstant(0, SL)),
1948 WideLoad.getValue(1)},
1949 SL);
1950}
1951
1952SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1953 SelectionDAG &DAG) const {
1954 StoreSDNode *Store = cast<StoreSDNode>(Op);
1955 SDValue Val = Store->getValue();
1956 EVT VT = Val.getValueType();
1957
1958 // If this is a 2 element vector, we really want to scalarize and not create
1959 // weird 1 element vectors.
1960 if (VT.getVectorNumElements() == 2)
1961 return scalarizeVectorStore(Store, DAG);
1962
1963 EVT MemVT = Store->getMemoryVT();
1964 SDValue Chain = Store->getChain();
1965 SDValue BasePtr = Store->getBasePtr();
1966 SDLoc SL(Op);
1967
1968 EVT LoVT, HiVT;
1969 EVT LoMemVT, HiMemVT;
1970 SDValue Lo, Hi;
1971
1972 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1973 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1974 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1975
1976 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1977
1978 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1979 Align BaseAlign = Store->getAlign();
1980 unsigned Size = LoMemVT.getStoreSize();
1981 Align HiAlign = commonAlignment(BaseAlign, Size);
1982
1983 SDValue LoStore =
1984 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1985 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1986 SDValue HiStore = DAG.getTruncStore(
1987 Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
1988 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1989
1990 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1991}
1992
1993// This is a shortcut for integer division because we have fast i32<->f32
1994// conversions, and fast f32 reciprocal instructions. The fractional part of a
1995// float is enough to accurately represent up to a 24-bit signed integer.
1996SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1997 bool Sign) const {
1998 SDLoc DL(Op);
1999 EVT VT = Op.getValueType();
2000 SDValue LHS = Op.getOperand(0);
2001 SDValue RHS = Op.getOperand(1);
2002 MVT IntVT = MVT::i32;
2003 MVT FltVT = MVT::f32;
2004
2005 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
2006 if (LHSSignBits < 9)
2007 return SDValue();
2008
2009 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2010 if (RHSSignBits < 9)
2011 return SDValue();
2012
2013 unsigned BitSize = VT.getSizeInBits();
2014 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2015 unsigned DivBits = BitSize - SignBits;
2016 if (Sign)
2017 ++DivBits;
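  // DivBits = BitSize - SignBits (+1 for signed) is at most 24 here, so the
  // operands and the quotient fit in the 24-bit significand of an f32.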
2018
2019 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2020 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
2021
2022 SDValue jq = DAG.getConstant(1, DL, IntVT);
2023
2024 if (Sign) {
2025 // char|short jq = ia ^ ib;
2026 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2027
2028 // jq = jq >> (bitsize - 2)
2029 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2030 DAG.getConstant(BitSize - 2, DL, VT));
2031
2032 // jq = jq | 0x1
2033 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2034 }
2035
2036 // int ia = (int)LHS;
2037 SDValue ia = LHS;
2038
2039 // int ib = (int)RHS;
2040 SDValue ib = RHS;
2041
2042 // float fa = (float)ia;
2043 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2044
2045 // float fb = (float)ib;
2046 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2047
2048 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2049 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2050
2051 // fq = trunc(fq);
2052 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2053
2054 // float fqneg = -fq;
2055 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2056
2057 MachineFunction &MF = DAG.getMachineFunction();
2058
2059 bool UseFmadFtz = false;
2060 if (Subtarget->isGCN()) {
2061 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2062 UseFmadFtz =
2063 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
2064 }
2065
2066 // float fr = mad(fqneg, fb, fa);
2067 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2068 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2069 : (unsigned)ISD::FMAD;
2070 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2071
2072 // int iq = (int)fq;
2073 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2074
2075 // fr = fabs(fr);
2076 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2077
2078 // fb = fabs(fb);
2079 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2080
2081 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2082
2083 // int cv = fr >= fb;
2084 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2085
2086 // jq = (cv ? jq : 0);
2087 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2088
2089 // dst = iq + jq;
2090 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2091
2092 // Rem needs compensation; it's easier to recompute it.
2093 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2094 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2095
2096 // Truncate to number of bits this divide really is.
2097 if (Sign) {
2098 SDValue InRegSize
2099 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2100 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2101 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2102 } else {
2103 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2104 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2105 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2106 }
2107
2108 return DAG.getMergeValues({ Div, Rem }, DL);
2109}
2110
2111void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2112 SelectionDAG &DAG,
2113 SmallVectorImpl<SDValue> &Results) const {
2114 SDLoc DL(Op);
2115 EVT VT = Op.getValueType();
2116
2117 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2118
2119 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2120
2121 SDValue One = DAG.getConstant(1, DL, HalfVT);
2122 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2123
2124 //HiLo split
2125 SDValue LHS_Lo, LHS_Hi;
2126 SDValue LHS = Op.getOperand(0);
2127 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2128
2129 SDValue RHS_Lo, RHS_Hi;
2130 SDValue RHS = Op.getOperand(1);
2131 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2132
2133 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2134 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2135
2136 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2137 LHS_Lo, RHS_Lo);
2138
2139 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2140 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2141
2142 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2143 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2144 return;
2145 }
2146
2147 if (isTypeLegal(MVT::i64)) {
2148 // The algorithm here is based on ideas from "Software Integer Division",
2149 // Tom Rodeheffer, August 2008.
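    // Outline: estimate 1/RHS in f32, widen it to a 64-bit reciprocal, refine
    // it with two Newton-Raphson (UNR) steps, then correct the quotient and
    // remainder with at most two conditional subtractions of RHS.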
2150
2153
2154 // Compute denominator reciprocal.
2155 unsigned FMAD =
2156 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2159 : (unsigned)AMDGPUISD::FMAD_FTZ;
2160
2161 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2162 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2163 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2164 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2165 Cvt_Lo);
2166 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2167 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2168 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2169 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2170 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2171 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2172 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2173 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2174 Mul1);
2175 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2176 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2177 SDValue Rcp64 = DAG.getBitcast(VT,
2178 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2179
2180 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2181 SDValue One64 = DAG.getConstant(1, DL, VT);
2182 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2183 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2184
2185 // First round of UNR (Unsigned integer Newton-Raphson).
2186 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2187 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2188 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2189 SDValue Mulhi1_Lo, Mulhi1_Hi;
2190 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2191 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2192 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2193 Mulhi1_Lo, Zero1);
2194 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2195 Mulhi1_Hi, Add1_Lo.getValue(1));
2196 SDValue Add1 = DAG.getBitcast(VT,
2197 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2198
2199 // Second round of UNR.
2200 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2201 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2202 SDValue Mulhi2_Lo, Mulhi2_Hi;
2203 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2204 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2205 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2206 Mulhi2_Lo, Zero1);
2207 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2208 Mulhi2_Hi, Add2_Lo.getValue(1));
2209 SDValue Add2 = DAG.getBitcast(VT,
2210 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2211
2212 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2213
2214 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2215
2216 SDValue Mul3_Lo, Mul3_Hi;
2217 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2218 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2219 Mul3_Lo, Zero1);
2220 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2221 Mul3_Hi, Sub1_Lo.getValue(1));
2222 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2223 SDValue Sub1 = DAG.getBitcast(VT,
2224 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2225
2226 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2227 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2228 ISD::SETUGE);
2229 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2230 ISD::SETUGE);
2231 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2232
2233 // TODO: Here and below portions of the code can be enclosed into if/endif.
2234 // Currently control flow is unconditional and we have 4 selects after
2235 // potential endif to substitute PHIs.
2236
2237 // if C3 != 0 ...
2238 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2239 RHS_Lo, Zero1);
2240 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2241 RHS_Hi, Sub1_Lo.getValue(1));
2242 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2243 Zero, Sub2_Lo.getValue(1));
2244 SDValue Sub2 = DAG.getBitcast(VT,
2245 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2246
2247 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2248
2249 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2250 ISD::SETUGE);
2251 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2252 ISD::SETUGE);
2253 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2254
2255 // if (C6 != 0)
2256 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2257
2258 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2259 RHS_Lo, Zero1);
2260 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2261 RHS_Hi, Sub2_Lo.getValue(1));
2262 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2263 Zero, Sub3_Lo.getValue(1));
2264 SDValue Sub3 = DAG.getBitcast(VT,
2265 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2266
2267 // endif C6
2268 // endif C3
2269
2270 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2271 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2272
2273 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2274 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2275
2276 Results.push_back(Div);
2277 Results.push_back(Rem);
2278
2279 return;
2280 }
2281
2282 // r600 expansion.
2283 // Get Speculative values
2284 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2285 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2286
2287 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2288 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2289 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2290
2291 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2292 SDValue DIV_Lo = Zero;
2293
2294 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2295
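  // Restoring long division over the low half: each iteration shifts one bit
  // of LHS_Lo into REM, and subtracts RHS (setting the corresponding DIV_Lo
  // bit) whenever REM >= RHS.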
2296 for (unsigned i = 0; i < halfBitWidth; ++i) {
2297 const unsigned bitPos = halfBitWidth - i - 1;
2298 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2299 // Get value of high bit
2300 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2301 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2302 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2303
2304 // Shift
2305 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2306 // Add LHS high bit
2307 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2308
2309 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2310 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2311
2312 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2313
2314 // Update REM
2315 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2316 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2317 }
2318
2319 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2320 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2321 Results.push_back(DIV);
2322 Results.push_back(REM);
2323}
2324
2325SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2326 SelectionDAG &DAG) const {
2327 SDLoc DL(Op);
2328 EVT VT = Op.getValueType();
2329
2330 if (VT == MVT::i64) {
2331 SmallVector<SDValue, 2> Results;
2332 LowerUDIVREM64(Op, DAG, Results);
2333 return DAG.getMergeValues(Results, DL);
2334 }
2335
2336 if (VT == MVT::i32) {
2337 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2338 return Res;
2339 }
2340
2341 SDValue X = Op.getOperand(0);
2342 SDValue Y = Op.getOperand(1);
2343
2344 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2345 // algorithm used here.
2346
2347 // Initial estimate of inv(y).
2348 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2349
2350 // One round of UNR.
2351 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2352 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2353 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2354 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2355
2356 // Quotient/remainder estimate.
2357 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2358 SDValue R =
2359 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2360
2361 // First quotient/remainder refinement.
2362 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2363 SDValue One = DAG.getConstant(1, DL, VT);
2364 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2365 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2366 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2367 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2368 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2369
2370 // Second quotient/remainder refinement.
2371 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2372 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2373 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2374 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2375 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2376
2377 return DAG.getMergeValues({Q, R}, DL);
2378}
2379
2380SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2381 SelectionDAG &DAG) const {
2382 SDLoc DL(Op);
2383 EVT VT = Op.getValueType();
2384
2385 SDValue LHS = Op.getOperand(0);
2386 SDValue RHS = Op.getOperand(1);
2387
2388 SDValue Zero = DAG.getConstant(0, DL, VT);
2389 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2390
2391 if (VT == MVT::i32) {
2392 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2393 return Res;
2394 }
2395
2396 if (VT == MVT::i64 &&
2397 DAG.ComputeNumSignBits(LHS) > 32 &&
2398 DAG.ComputeNumSignBits(RHS) > 32) {
2399 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2400
2401 //HiLo split
2402 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2403 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2404 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2405 LHS_Lo, RHS_Lo);
2406 SDValue Res[2] = {
2407 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2408 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2409 };
2410 return DAG.getMergeValues(Res, DL);
2411 }
2412
2413 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2414 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2415 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2416 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2417
2418 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2419 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2420
2421 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2422 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2423
2424 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2425 SDValue Rem = Div.getValue(1);
2426
2427 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2428 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2429
2430 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2431 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2432
2433 SDValue Res[2] = {
2434 Div,
2435 Rem
2436 };
2437 return DAG.getMergeValues(Res, DL);
2438}
2439
2440SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2441 SDLoc SL(Op);
2442 SDValue Src = Op.getOperand(0);
2443
2444 // result = trunc(src)
2445 // if (src > 0.0 && src != result)
2446 // result += 1.0
2447
2448 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2449
2450 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2451 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2452
2453 EVT SetCCVT =
2454 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2455
2456 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2457 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2458 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2459
2460 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2461 // TODO: Should this propagate fast-math-flags?
2462 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2463}
2464
2465static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2466 SelectionDAG &DAG) {
2467 const unsigned FractBits = 52;
2468 const unsigned ExpBits = 11;
2469
2470 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2471 Hi,
2472 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2473 DAG.getConstant(ExpBits, SL, MVT::i32));
2474 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2475 DAG.getConstant(1023, SL, MVT::i32));
2476
2477 return Exp;
2478}
2479
2480SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2481 SDLoc SL(Op);
2482 SDValue Src = Op.getOperand(0);
2483
2484 assert(Op.getValueType() == MVT::f64);
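  // Truncate by clearing the fraction bits below the binary point: with
  // unbiased exponent Exp, only the top Exp fraction bits are kept. Exp < 0
  // leaves just the sign bit (+/-0.0), and Exp > 51 means the value is
  // already an integer and is returned unchanged.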
2485
2486 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2487
2488 // Extract the upper half, since this is where we will find the sign and
2489 // exponent.
2490 SDValue Hi = getHiHalf64(Src, DAG);
2491
2492 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2493
2494 const unsigned FractBits = 52;
2495
2496 // Extract the sign bit.
2497 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2498 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2499
2500 // Extend back to 64-bits.
2501 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2502 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2503
2504 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2505 const SDValue FractMask
2506 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2507
2508 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2509 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2510 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2511
2512 EVT SetCCVT =
2513 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2514
2515 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2516
2517 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2518 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2519
2520 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2521 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2522
2523 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2524}
2525
2526SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2527 SelectionDAG &DAG) const {
2528 SDLoc SL(Op);
2529 SDValue Src = Op.getOperand(0);
2530
2531 assert(Op.getValueType() == MVT::f64);
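  // Adding and subtracting copysign(0x1.0p+52, Src) forces the hardware to
  // round Src to an integer in the default round-to-nearest-even mode.
  // Values with |Src| > 0x1.fffffffffffffp+51 are already integral and are
  // returned unchanged.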
2532
2533 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2534 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2535 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2536
2537 // TODO: Should this propagate fast-math-flags?
2538
2539 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2540 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2541
2542 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2543
2544 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2545 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2546
2547 EVT SetCCVT =
2548 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2549 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2550
2551 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2552}
2553
2554SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2555 SelectionDAG &DAG) const {
2556 // FNEARBYINT and FRINT are the same, except in their handling of FP
2557 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2558 // rint, so just treat them as equivalent.
2559 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2560 Op.getOperand(0));
2561}
2562
2563SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2564 auto VT = Op.getValueType();
2565 auto Arg = Op.getOperand(0u);
2566 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2567}
2568
2569// XXX - May require not supporting f32 denormals?
2570
2571// Don't handle v2f16. The extra instructions to scalarize and repack around the
2572// compare and vselect end up producing worse code than scalarizing the whole
2573// operation.
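// round(x) == trunc(x) + copysign(|x - trunc(x)| >= 0.5 ? 1.0 : 0.0, x)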
2574SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2575 SDLoc SL(Op);
2576 SDValue X = Op.getOperand(0);
2577 EVT VT = Op.getValueType();
2578
2579 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2580
2581 // TODO: Should this propagate fast-math-flags?
2582
2583 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2584
2585 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2586
2587 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2588 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2589
2590 EVT SetCCVT =
2591 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2592
2593 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2594 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2595 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2596
2597 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2598 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2599}
2600
2601SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2602 SDLoc SL(Op);
2603 SDValue Src = Op.getOperand(0);
2604
2605 // result = trunc(src);
2606 // if (src < 0.0 && src != result)
2607 // result += -1.0.
2608
2609 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2610
2611 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2612 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2613
2614 EVT SetCCVT =
2615 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2616
2617 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2618 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2619 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2620
2621 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2622 // TODO: Should this propagate fast-math-flags?
2623 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2624}
2625
2626/// Return true if it's known that \p Src can never be an f32 denormal value.
2627static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2628 switch (Src.getOpcode()) {
2629 case ISD::FP_EXTEND:
2630 return Src.getOperand(0).getValueType() == MVT::f16;
2631 case ISD::FP16_TO_FP:
2632 case ISD::FFREXP:
2633 case ISD::FSQRT:
2634 case AMDGPUISD::LOG:
2635 case AMDGPUISD::EXP:
2636 return true;
2637 case ISD::INTRINSIC_WO_CHAIN: {
2638 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2639 switch (IntrinsicID) {
2640 case Intrinsic::amdgcn_frexp_mant:
2641 case Intrinsic::amdgcn_log:
2642 case Intrinsic::amdgcn_log_clamp:
2643 case Intrinsic::amdgcn_exp2:
2644 case Intrinsic::amdgcn_sqrt:
2645 return true;
2646 default:
2647 return false;
2648 }
2649 }
2650 default:
2651 return false;
2652 }
2653
2654 llvm_unreachable("covered opcode switch");
2655}
2656
2657static bool allowApproxFunc(const SelectionDAG &DAG,
2658 SDNodeFlags Flags) {
2659 return Flags.hasApproximateFuncs();
2660}
2661
2670
2671SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2672 SDValue Src,
2673 SDNodeFlags Flags) const {
2674 SDLoc SL(Src);
2675 EVT VT = Src.getValueType();
2676 const fltSemantics &Semantics = VT.getFltSemantics();
2677 SDValue SmallestNormal =
2678 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2679
2680 // Want to scale denormals up, but negatives and 0 work just as well on the
2681 // scaled path.
2682 SDValue IsLtSmallestNormal = DAG.getSetCC(
2683 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2684 SmallestNormal, ISD::SETOLT);
2685
2686 return IsLtSmallestNormal;
2687}
2688
2690 SDNodeFlags Flags) const {
2691 SDLoc SL(Src);
2692 EVT VT = Src.getValueType();
2693 const fltSemantics &Semantics = VT.getFltSemantics();
2694 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2695
2696 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2697 SDValue IsFinite = DAG.getSetCC(
2698 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2699 Inf, ISD::SETOLT);
2700 return IsFinite;
2701}
2702
2703/// If denormal handling is required return the scaled input to FLOG2, and the
2704/// check for denormal range. Otherwise, return null values.
2705std::pair<SDValue, SDValue>
2707 SDValue Src, SDNodeFlags Flags) const {
2708 if (!needsDenormHandlingF32(DAG, Src, Flags))
2709 return {};
2710
2711 MVT VT = MVT::f32;
2712 const fltSemantics &Semantics = APFloat::IEEEsingle();
2713 SDValue SmallestNormal =
2714 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2715
2716 SDValue IsLtSmallestNormal = DAG.getSetCC(
2717 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2718 SmallestNormal, ISD::SETOLT);
2719
2720 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2721 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2722 SDValue ScaleFactor =
2723 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2724
2725 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2726 return {ScaledInput, IsLtSmallestNormal};
2727}
2728
2729SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2730 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2731 // If we have to handle denormals, scale up the input and adjust the result.
2732
2733 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2734 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2735
2736 SDLoc SL(Op);
2737 EVT VT = Op.getValueType();
2738 SDValue Src = Op.getOperand(0);
2739 SDNodeFlags Flags = Op->getFlags();
2740
2741 if (VT == MVT::f16) {
2742 // Nothing in half is a denormal when promoted to f32.
2743 assert(!isTypeLegal(VT));
2744 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2745 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2746 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2747 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2748 }
2749
2750 auto [ScaledInput, IsLtSmallestNormal] =
2751 getScaledLogInput(DAG, SL, Src, Flags);
2752 if (!ScaledInput)
2753 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2754
2755 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2756
2757 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2758 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2759 SDValue ResultOffset =
2760 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2761 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2762}
2763
2764static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2765 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2766 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2767 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2768}
2769
2771 SelectionDAG &DAG) const {
2772 SDValue X = Op.getOperand(0);
2773 EVT VT = Op.getValueType();
2774 SDNodeFlags Flags = Op->getFlags();
2775 SDLoc DL(Op);
2776 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2777 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2778
2779 const auto &Options = getTargetMachine().Options;
2780 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2781
2782 if (VT == MVT::f16 && !isTypeLegal(MVT::f16)) {
2783 // Log and multiply in f32 is good enough for f16.
2784 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2785 }
2786
2787 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2788 if (VT == MVT::f16 && !isTypeLegal(MVT::f16)) {
2789 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2790 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2791 }
2792
2793 return Lowered;
2794 }
2795
2796 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2797 if (ScaledInput)
2798 X = ScaledInput;
2799
2800 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2801
2802 SDValue R;
2803 if (Subtarget->hasFastFMAF32()) {
2804 // c+cc are ln(2)/ln(10) to more than 49 bits
2805 const float c_log10 = 0x1.344134p-2f;
2806 const float cc_log10 = 0x1.09f79ep-26f;
2807
2808 // c + cc is ln(2) to more than 49 bits
2809 const float c_log = 0x1.62e42ep-1f;
2810 const float cc_log = 0x1.efa39ep-25f;
2811
2812 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2813 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
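    // Evaluate Y*(c+cc) in extended precision: the FMA recovers the rounding
    // error of Y*c, and the Y*cc term is added into that low-order correction
    // before the final add.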
2814 // This adds correction terms for which contraction may lead to an increase
2815 // in the error of the approximation, so disable it.
2816 Flags.setAllowContract(false);
2817 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2818 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2819 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2820 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2821 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2822 } else {
2823 // ch+ct is ln(2)/ln(10) to more than 36 bits
2824 const float ch_log10 = 0x1.344000p-2f;
2825 const float ct_log10 = 0x1.3509f6p-18f;
2826
2827 // ch + ct is ln(2) to more than 36 bits
2828 const float ch_log = 0x1.62e000p-1f;
2829 const float ct_log = 0x1.0bfbe8p-15f;
2830
2831 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2832 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2833
2834 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2835 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2836 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2837 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2838 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2839 // This adds correction terms for which contraction may lead to an increase
2840 // in the error of the approximation, so disable it.
2841 Flags.setAllowContract(false);
2842 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2843 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2844 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2845 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2846 }
2847
2848 const bool IsFiniteOnly =
2849 (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
2850
2851 // TODO: Check if known finite from source value.
2852 if (!IsFiniteOnly) {
2853 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2854 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2855 }
2856
2857 if (IsScaled) {
2858 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2859 SDValue ShiftK =
2860 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2861 SDValue Shift =
2862 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2863 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2864 }
2865
2866 return R;
2867}
2868
2872
2873// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2874// promoted f16 operation.
2875SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2876 SelectionDAG &DAG, bool IsLog10,
2877 SDNodeFlags Flags) const {
2878 EVT VT = Src.getValueType();
2879 unsigned LogOp =
2880 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2881
2882 double Log2BaseInverted =
2883 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2884
2885 if (VT == MVT::f32) {
2886 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2887 if (ScaledInput) {
2888 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2889 SDValue ScaledResultOffset =
2890 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2891
2892 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2893
2894 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2895 ScaledResultOffset, Zero, Flags);
2896
2897 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2898
2899 if (Subtarget->hasFastFMAF32())
2900 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2901 Flags);
2902 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2903 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2904 }
2905 }
2906
2907 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2908 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2909
2910 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2911 Flags);
2912}
2913
2914SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2915 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2916 // If we have to handle denormals, scale up the input and adjust the result.
2917
2918 SDLoc SL(Op);
2919 EVT VT = Op.getValueType();
2920 SDValue Src = Op.getOperand(0);
2921 SDNodeFlags Flags = Op->getFlags();
2922
2923 if (VT == MVT::f16) {
2924 // Nothing in half is a denormal when promoted to f32.
2925 assert(!isTypeLegal(MVT::f16));
2926 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2927 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2928 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2929 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2930 }
2931
2932 assert(VT == MVT::f32);
2933
2934 if (!needsDenormHandlingF32(DAG, Src, Flags))
2935 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2936
2937 // bool needs_scaling = x < -0x1.f80000p+6f;
2938 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2939
2940 // -nextafter(128.0, -1)
2941 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2942
2943 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2944
2945 SDValue NeedsScaling =
2946 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2947
2948 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2949 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2950
2951 SDValue AddOffset =
2952 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2953
2954 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2955 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2956
2957 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2958 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2959 SDValue ResultScale =
2960 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2961
2962 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2963}
2964
2965SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
2966 SelectionDAG &DAG,
2967 SDNodeFlags Flags,
2968 bool IsExp10) const {
2969 // exp(x) -> exp2(M_LOG2E_F * x);
2970 // exp10(x) -> exp2(log2(10) * x);
2971 EVT VT = X.getValueType();
2972 SDValue Const =
2973 DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
2974
2975 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
2976 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2977 : (unsigned)ISD::FEXP2,
2978 SL, VT, Mul, Flags);
2979}
2980
2981SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2982 SelectionDAG &DAG,
2983 SDNodeFlags Flags) const {
2984 EVT VT = X.getValueType();
2985 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
2986 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
2987
2988 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2989
2990 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2991 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2992
2993 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2994
2995 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2996
2997 SDValue AdjustedX =
2998 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2999
3000 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
3001 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
3002
3003 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
3004
3005 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
3006 SDValue AdjustedResult =
3007 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
3008
3009 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3010 Flags);
3011}
3012
3013/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3014/// handled correctly.
3015SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
3016 SelectionDAG &DAG,
3017 SDNodeFlags Flags) const {
3018 const EVT VT = X.getValueType();
3019
3020 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3021 : static_cast<unsigned>(ISD::FEXP2);
3022
3023 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3024 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
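    // K0 + K1 together approximate log2(10); splitting the constant keeps
    // low-order bits that a single f32 multiply by log2(10) would lose, and
    // exp2(x*K0) * exp2(x*K1) == exp2(x*(K0 + K1)).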
3025 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3026 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3027
3028 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3029 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3030 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3031 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3032 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3033 }
3034
3035 // bool s = x < -0x1.2f7030p+5f;
3036 // x += s ? 0x1.0p+5f : 0.0f;
3037 // exp10 = exp2(x * 0x1.a92000p+1f) *
3038 // exp2(x * 0x1.4f0978p-11f) *
3039 // (s ? 0x1.9f623ep-107f : 1.0f);
3040
3041 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3042
3043 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3044 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3045
3046 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3047 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3048 SDValue AdjustedX =
3049 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3050
3051 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3052 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3053
3054 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3055 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3056 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3057 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3058
3059 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3060
3061 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3062 SDValue AdjustedResult =
3063 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3064
3065 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3066 Flags);
3067}
3068
3069SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
3070 EVT VT = Op.getValueType();
3071 SDLoc SL(Op);
3072 SDValue X = Op.getOperand(0);
3073 SDNodeFlags Flags = Op->getFlags();
3074 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3075
3076 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3077 // library behavior. Also, is known-not-daz source sufficient?
3078 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3079 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3080 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3081 }
3082
3083 if (VT.getScalarType() == MVT::f16) {
3084 if (VT.isVector())
3085 return SDValue();
3086
3087 // Nothing in half is a denormal when promoted to f32.
3088 //
3089 // exp(f16 x) ->
3090 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3091 //
3092 // exp10(f16 x) ->
3093 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3094 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3095 SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
3096 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3097 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3098 }
3099
3100 assert(VT == MVT::f32);
3101
3102 // Algorithm:
3103 //
3104 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3105 //
3106 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3107 // n = 64*m + j, 0 <= j < 64
3108 //
3109 // e^x = 2^((64*m + j + f)/64)
3110 // = (2^m) * (2^(j/64)) * 2^(f/64)
3111 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3112 //
3113 // f = x*(64/ln(2)) - n
3114 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3115 //
3116 // e^x = (2^m) * (2^(j/64)) * e^r
3117 //
3118 // (2^(j/64)) is precomputed
3119 //
3120 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3121 // e^r = 1 + q
3122 //
3123 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3124 //
3125 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
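  // Below, PH + PL approximates x*log2(e) (or x*log2(10)) in extended
  // precision, E = roundeven(PH) plays the role of n, and the result is
  // ldexp(exp2(PH - E + PL), E), followed by explicit underflow/overflow
  // handling.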
3126 SDNodeFlags FlagsNoContract = Flags;
3127 FlagsNoContract.setAllowContract(false);
3128
3129 SDValue PH, PL;
3130 if (Subtarget->hasFastFMAF32()) {
3131 const float c_exp = numbers::log2ef;
3132 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3133 const float c_exp10 = 0x1.a934f0p+1f;
3134 const float cc_exp10 = 0x1.2f346ep-24f;
3135
3136 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3137 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3138
3139 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3140 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3141 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3142 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3143 } else {
3144 const float ch_exp = 0x1.714000p+0f;
3145 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3146
3147 const float ch_exp10 = 0x1.a92000p+1f;
3148 const float cl_exp10 = 0x1.4f0978p-11f;
3149
3150 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3151 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3152
3153 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3154 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3155 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3156 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3157 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3158
3159 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3160
3161 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3162 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3163 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3164 }
3165
3166 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3167
3168 // It is unsafe to contract this fsub into the PH multiply.
3169 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3170
3171 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3172 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3173 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3174
3175 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3176
3177 SDValue UnderflowCheckConst =
3178 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3179
3180 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3181 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3182 SDValue Underflow =
3183 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3184
3185 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3186
3187 if (!Flags.hasNoInfs()) {
3188 SDValue OverflowCheckConst =
3189 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3190 SDValue Overflow =
3191 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3192 SDValue Inf =
3193 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3194 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3195 }
3196
3197 return R;
3198}
3199
3200static bool isCtlzOpc(unsigned Opc) {
3201 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3202}
3203
3204static bool isCttzOpc(unsigned Opc) {
3205 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3206}
3207
3208SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3209 SelectionDAG &DAG) const {
3210 auto SL = SDLoc(Op);
3211 auto Opc = Op.getOpcode();
3212 auto Arg = Op.getOperand(0u);
3213 auto ResultVT = Op.getValueType();
3214
3215 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3216 return {};
3217
3218 assert(isCtlzOpc(Opc));
3219 assert(ResultVT == Arg.getValueType());
3220
3221 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3222 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3223 SDValue NewOp;
3224
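  // ctlz_i32(zero_extend x) == ctlz_iN(x) + (32 - N). The zero-undef form
  // pre-shifts the value into the top bits so no correction is needed; the
  // general form subtracts the correction afterwards, which also yields N
  // for x == 0.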
3225 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3226 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3227 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3228 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3229 } else {
3230 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3231 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3232 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3233 }
3234
3235 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3236}
3237
3239 SDLoc SL(Op);
3240 SDValue Src = Op.getOperand(0);
3241
3242 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3243 bool Ctlz = isCtlzOpc(Op.getOpcode());
3244 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3245
3246 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3247 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3248 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3249
3250 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3251 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3252 // (cttz hi:lo) -> (umin (ffbl src), 32)
3253 // (ctlz_zero_undef src) -> (ffbh src)
3254 // (cttz_zero_undef src) -> (ffbl src)
3255
3256 // The 64-bit scalar version produces a 32-bit result:
3257 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3258 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3259 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3260 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3261 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3262 if (!ZeroUndef) {
3263 const SDValue ConstVal = DAG.getConstant(
3264 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3265 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3266 }
3267 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3268 }
3269
3270 SDValue Lo, Hi;
3271 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3272
3273 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3274 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3275
3276 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3277 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3278 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3279 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3280
3281 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3282 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3283 if (Ctlz)
3284 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3285 else
3286 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3287
3288 SDValue NewOpr;
3289 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3290 if (!ZeroUndef) {
3291 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3292 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3293 }
3294
3295 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3296}
3297
3298SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3299 bool Signed) const {
3300 // The regular method converting a 64-bit integer to float roughly consists of
3301 // 2 steps: normalization and rounding. In fact, after normalization, the
3302 // conversion from a 64-bit integer to a float is essentially the same as the
3303 // one from a 32-bit integer. The only difference is that it has more
3304 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3305 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3306 // converted into the correct float number. The basic steps for the unsigned
3307 // conversion are illustrated in the following pseudo code:
3308 //
3309 // f32 uitofp(i64 u) {
3310 // i32 hi, lo = split(u);
3311 // // Only count the leading zeros in hi as we have native support of the
3312 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3313 // // reduced to a 32-bit one automatically.
3314 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3315 // u <<= shamt;
3316 // hi, lo = split(u);
3317 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3318 // // convert it as a 32-bit integer and scale the result back.
3319 // return uitofp(hi) * 2^(32 - shamt);
3320 // }
3321 //
3322 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3323 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3324 // converted instead, followed by negation based on its sign bit.
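  // The 'hi |= (lo != 0)' adjustment acts as a sticky bit: any bits shifted
  // out of the low word still influence rounding of the final 32-bit
  // conversion.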
3325
3326 SDLoc SL(Op);
3327 SDValue Src = Op.getOperand(0);
3328
3329 SDValue Lo, Hi;
3330 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3331 SDValue Sign;
3332 SDValue ShAmt;
3333 if (Signed && Subtarget->isGCN()) {
3334 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3335 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3336 // account. That is, the maximal shift is
3337 // - 32 if Lo and Hi have opposite signs;
3338 // - 33 if Lo and Hi have the same sign.
3339 //
3340 // Or, MaxShAmt = 33 + OppositeSign, where
3341 //
3342 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3343 // - -1 if Lo and Hi have opposite signs; and
3344 // - 0 otherwise.
3345 //
3346 // All in all, ShAmt is calculated as
3347 //
3348 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3349 //
3350 // or
3351 //
3352 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3353 //
3354 // to reduce the critical path.
3355 SDValue OppositeSign = DAG.getNode(
3356 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3357 DAG.getConstant(31, SL, MVT::i32));
3358 SDValue MaxShAmt =
3359 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3360 OppositeSign);
3361 // Count the leading sign bits.
3362 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3363 // Different from unsigned conversion, the shift should be one bit less to
3364 // preserve the sign bit.
3365 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3366 DAG.getConstant(1, SL, MVT::i32));
3367 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3368 } else {
3369 if (Signed) {
3370 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3371 // absolute value first.
3372 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3373 DAG.getConstant(63, SL, MVT::i64));
3374 SDValue Abs =
3375 DAG.getNode(ISD::XOR, SL, MVT::i64,
3376 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3377 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3378 }
3379 // Count the leading zeros.
3380 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3381 // The shift amount for signed integers is [0, 32].
3382 }
3383 // Normalize the given 64-bit integer.
3384 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3385 // Split it again.
3386 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3387 // Calculate the adjust bit for rounding.
3388 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3389 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3390 DAG.getConstant(1, SL, MVT::i32), Lo);
3391 // Get the 32-bit normalized integer.
3392 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3393 // Convert the normalized 32-bit integer into f32.
3394
3395 bool UseLDEXP = isOperationLegal(ISD::FLDEXP, MVT::f32);
3396 unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3397 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3398
3399 // Finally, need to scale back the converted floating number as the original
3400 // 64-bit integer is converted as a 32-bit one.
3401 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3402 ShAmt);
3403 // On GCN, use LDEXP directly.
3404 if (UseLDEXP)
3405 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3406
3407 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3408 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3409 // exponent is enough to avoid overflowing into the sign bit.
3410 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3411 DAG.getConstant(23, SL, MVT::i32));
3412 SDValue IVal =
3413 DAG.getNode(ISD::ADD, SL, MVT::i32,
3414 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3415 if (Signed) {
3416 // Set the sign bit.
3417 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3418 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3419 DAG.getConstant(31, SL, MVT::i32));
3420 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3421 }
3422 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3423}
3424
3425SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3426 bool Signed) const {
3427 SDLoc SL(Op);
3428 SDValue Src = Op.getOperand(0);
3429
3430 SDValue Lo, Hi;
3431 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
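  // Each 32-bit half converts to f64 exactly (f64 has a 53-bit significand),
  // so hi * 2^32 + lo rounds at most once, in the final add.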
3432
3433 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3434 SL, MVT::f64, Hi);
3435
3436 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3437
3438 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3439 DAG.getConstant(32, SL, MVT::i32));
3440 // TODO: Should this propagate fast-math-flags?
3441 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3442}
3443
3444SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3445 SelectionDAG &DAG) const {
3446 // TODO: Factor out code common with LowerSINT_TO_FP.
3447 EVT DestVT = Op.getValueType();
3448 SDValue Src = Op.getOperand(0);
3449 EVT SrcVT = Src.getValueType();
3450
3451 if (SrcVT == MVT::i16) {
3452 if (DestVT == MVT::f16)
3453 return Op;
3454 SDLoc DL(Op);
3455
3456 // Promote src to i32
3457 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3458 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3459 }
3460
3461 if (DestVT == MVT::bf16) {
3462 SDLoc SL(Op);
3463 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3464 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3465 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3466 }
3467
3468 if (SrcVT != MVT::i64)
3469 return Op;
3470
3471 if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
3472 SDLoc DL(Op);
3473
3474 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3475 SDValue FPRoundFlag =
3476 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3477 SDValue FPRound =
3478 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3479
3480 return FPRound;
3481 }
3482
3483 if (DestVT == MVT::f32)
3484 return LowerINT_TO_FP32(Op, DAG, false);
3485
3486 assert(DestVT == MVT::f64);
3487 return LowerINT_TO_FP64(Op, DAG, false);
3488}
3489
3490SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3491                                              SelectionDAG &DAG) const {
3492 EVT DestVT = Op.getValueType();
3493
3494 SDValue Src = Op.getOperand(0);
3495 EVT SrcVT = Src.getValueType();
3496
3497 if (SrcVT == MVT::i16) {
3498 if (DestVT == MVT::f16)
3499 return Op;
3500
3501 SDLoc DL(Op);
3502 // Promote src to i32
3503 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3504 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3505 }
3506
3507 if (DestVT == MVT::bf16) {
3508 SDLoc SL(Op);
3509 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3510 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3511 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3512 }
3513
3514 if (SrcVT != MVT::i64)
3515 return Op;
3516
3517 // TODO: Factor out code common with LowerUINT_TO_FP.
3518
3519 if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
3520 SDLoc DL(Op);
3521 SDValue Src = Op.getOperand(0);
3522
3523 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3524 SDValue FPRoundFlag =
3525 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3526 SDValue FPRound =
3527 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3528
3529 return FPRound;
3530 }
3531
3532 if (DestVT == MVT::f32)
3533 return LowerINT_TO_FP32(Op, DAG, true);
3534
3535 assert(DestVT == MVT::f64);
3536 return LowerINT_TO_FP64(Op, DAG, true);
3537}
3538
3539SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3540                                               bool Signed) const {
3541 SDLoc SL(Op);
3542
3543 SDValue Src = Op.getOperand(0);
3544 EVT SrcVT = Src.getValueType();
3545
3546 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3547
3548 // The basic idea of converting a floating point number into a pair of 32-bit
3549 // integers is illustrated as follows:
3550 //
3551 // tf := trunc(val);
3552 // hif := floor(tf * 2^-32);
3553 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3554 // hi := fptoi(hif);
3555 // lo := fptoi(lof);
3556 //
3557 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3558 SDValue Sign;
3559 if (Signed && SrcVT == MVT::f32) {
3560    // However, a 32-bit floating point number has only a 23-bit mantissa,
3561    // which is not enough to hold all the significant bits of `lof` if val
3562    // is negative. To avoid losing precision, we take the absolute value
3563    // after truncating and flip the result back based on the original
3564    // signedness.
3565 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3566 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3567 DAG.getConstant(31, SL, MVT::i32));
3568 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3569 }
3570
3571 SDValue K0, K1;
3572 if (SrcVT == MVT::f64) {
3573 K0 = DAG.getConstantFP(
3574 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3575 SrcVT);
3576 K1 = DAG.getConstantFP(
3577 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3578 SrcVT);
3579 } else {
3580 K0 = DAG.getConstantFP(
3581 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3582 K1 = DAG.getConstantFP(
3583 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3584 }
3585 // TODO: Should this propagate fast-math-flags?
3586 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3587
3588 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3589
3590 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3591
3592 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3593                                                          : ISD::FP_TO_UINT,
3594                           SL, MVT::i32, FloorMul);
3595 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3596
3597 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3598 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3599
3600 if (Signed && SrcVT == MVT::f32) {
3601 assert(Sign);
3602 // Flip the result based on the signedness, which is either all 0s or 1s.
3603 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3604 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3605 // r := xor(r, sign) - sign;
3606 Result =
3607 DAG.getNode(ISD::SUB, SL, MVT::i64,
3608 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3609 }
3610
3611 return Result;
3612}
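
// Illustrative sketch (editorial addition, not upstream code): the hif/lof
// decomposition above, written in plain C++ for the unsigned f64 case and
// assuming <cstdint> and <cmath>. For example, 1099511627779.0 (2^40 + 3)
// splits into Hi = 256 and Lo = 3.
static inline uint64_t illustrativeF64ToU64(double Val) {
  double Tf = std::trunc(Val);
  double Hif = std::floor(Tf * 0x1p-32);   // floor(tf * 2^-32)
  double Lof = std::fma(Hif, -0x1p32, Tf); // tf - hif * 2^32, always >= 0
  uint32_t Hi = static_cast<uint32_t>(Hif);
  uint32_t Lo = static_cast<uint32_t>(Lof);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}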
3613
3614SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3615  SDLoc DL(Op);
3616 SDValue N0 = Op.getOperand(0);
3617
3618 // Convert to target node to get known bits
3619 if (N0.getValueType() == MVT::f32)
3620 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3621
3622 if (Op->getFlags().hasApproximateFuncs()) {
3623 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3624 return SDValue();
3625 }
3626
3627 return LowerF64ToF16Safe(N0, DL, DAG);
3628}
3629
3630// Returns the f16 result bit pattern in the low 16 bits of an i32 node.
3631SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3632                                                SelectionDAG &DAG) const {
3633 assert(Src.getSimpleValueType() == MVT::f64);
3634
3635 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3636 // TODO: We can generate better code for True16.
3637 const unsigned ExpMask = 0x7ff;
3638 const unsigned ExpBiasf64 = 1023;
3639 const unsigned ExpBiasf16 = 15;
3640 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3641 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3642 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3643 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3644 DAG.getConstant(32, DL, MVT::i64));
3645 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3646 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3647 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3648 DAG.getConstant(20, DL, MVT::i64));
3649 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3650 DAG.getConstant(ExpMask, DL, MVT::i32));
3651 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3652 // add the f16 bias (15) to get the biased exponent for the f16 format.
3653 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3654 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3655
3656 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3657 DAG.getConstant(8, DL, MVT::i32));
3658 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3659 DAG.getConstant(0xffe, DL, MVT::i32));
3660
3661 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3662 DAG.getConstant(0x1ff, DL, MVT::i32));
3663 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3664
3665 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3666 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3667
3668 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3669 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3670 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3671 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3672
3673 // N = M | (E << 12);
3674 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3675 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3676 DAG.getConstant(12, DL, MVT::i32)));
3677
3678 // B = clamp(1-E, 0, 13);
3679 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3680 One, E);
3681 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3682 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3683 DAG.getConstant(13, DL, MVT::i32));
3684
3685 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3686 DAG.getConstant(0x1000, DL, MVT::i32));
3687
3688 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3689 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3690 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3691 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3692
3693 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3694 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3695 DAG.getConstant(0x7, DL, MVT::i32));
3696 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3697 DAG.getConstant(2, DL, MVT::i32));
3698 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3699 One, Zero, ISD::SETEQ);
3700 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3701 One, Zero, ISD::SETGT);
3702 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3703 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3704
3705 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3706 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3707 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3708 I, V, ISD::SETEQ);
3709
3710 // Extract the sign bit.
3711 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3712 DAG.getConstant(16, DL, MVT::i32));
3713 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3714 DAG.getConstant(0x8000, DL, MVT::i32));
3715
3716 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3717}
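
// Illustrative sketch (editorial addition, not upstream code): the field
// extraction and exponent re-biasing used above, shown only for the common
// normal-number path (no denormal, overflow, or round-to-nearest-even
// handling). Assumes <cstdint> and <cstring>.
static inline uint16_t illustrativeF64ToF16FieldsOnly(double V) {
  uint64_t U;
  std::memcpy(&U, &V, sizeof(U));
  uint32_t UH = static_cast<uint32_t>(U >> 32);
  uint32_t Sign = (UH >> 16) & 0x8000; // sign bit in f16 position
  int32_t E = static_cast<int32_t>((UH >> 20) & 0x7ff) - 1023 + 15; // rebias
  uint32_t M = (UH >> 10) & 0x3ff;     // top 10 mantissa bits
  return static_cast<uint16_t>(Sign | (uint32_t(E) << 10) | M);
}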
3718
3719SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3720                                             SelectionDAG &DAG) const {
3721 SDValue Src = Op.getOperand(0);
3722 unsigned OpOpcode = Op.getOpcode();
3723 EVT SrcVT = Src.getValueType();
3724 EVT DestVT = Op.getValueType();
3725
3726 // Will be selected natively
3727 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3728 return Op;
3729
3730 if (SrcVT == MVT::bf16) {
3731 SDLoc DL(Op);
3732 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3733 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3734 }
3735
3736 // Promote i16 to i32
3737 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3738 SDLoc DL(Op);
3739
3740 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3741 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3742 }
3743
3744 if (DestVT != MVT::i64)
3745 return Op;
3746
3747 if (SrcVT == MVT::f16 ||
3748 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3749 SDLoc DL(Op);
3750
3751 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3752 unsigned Ext =
3753        OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3754    return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3755 }
3756
3757 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3758 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3759
3760 return SDValue();
3761}
3762
3764 SelectionDAG &DAG) const {
3765 SDValue Src = Op.getOperand(0);
3766 unsigned OpOpcode = Op.getOpcode();
3767 EVT SrcVT = Src.getValueType();
3768 EVT DstVT = Op.getValueType();
3769 SDValue SatVTOp = Op.getNode()->getOperand(1);
3770 EVT SatVT = cast<VTSDNode>(SatVTOp)->getVT();
3771 SDLoc DL(Op);
3772
3773 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3774 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3775 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3776
3777 // Will be selected natively
3778 if (DstVT == MVT::i32 && SatWidth == DstWidth &&
3779 (SrcVT == MVT::f32 || SrcVT == MVT::f64))
3780 return Op;
3781
3782 const SDValue Int32VT = DAG.getValueType(MVT::i32);
3783
3784 // Perform all saturation at i32 and truncate
3785 if (SatWidth < DstWidth) {
3786 const uint64_t Int32Width = 32;
3787 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src, Int32VT);
3788 SDValue Int32SatVal;
3789
3790 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
3791 SDValue MinConst = DAG.getConstant(
3792 APInt::getSignedMaxValue(SatWidth).sext(Int32Width), DL, MVT::i32);
3793 SDValue MaxConst = DAG.getConstant(
3794 APInt::getSignedMinValue(SatWidth).sext(Int32Width), DL, MVT::i32);
3795 SDValue MinVal =
3796 DAG.getNode(ISD::SMIN, DL, MVT::i32, FpToInt32, MinConst);
3797 Int32SatVal = DAG.getNode(ISD::SMAX, DL, MVT::i32, MinVal, MaxConst);
3798 } else {
3799 SDValue MinConst = DAG.getConstant(
3800 APInt::getMaxValue(SatWidth).zext(Int32Width), DL, MVT::i32);
3801 Int32SatVal = DAG.getNode(ISD::UMIN, DL, MVT::i32, FpToInt32, MinConst);
3802 }
3803
3804 if (DstWidth == Int32Width)
3805 return Int32SatVal;
3806 if (DstWidth < Int32Width)
3807 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Int32SatVal);
3808
3809 // DstWidth > Int32Width
3810 const unsigned Ext =
3811        OpOpcode == ISD::FP_TO_SINT_SAT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3812    return DAG.getNode(Ext, DL, DstVT, FpToInt32);
3813 }
3814
3815 // SatWidth == DstWidth
3816
3817 // Saturate at i32 for i64 dst and 16b src (will invoke f16 promotion below)
3818 if (DstVT == MVT::i64 &&
3819 (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
3820 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
3821 return DAG.getNode(OpOpcode, DL, DstVT, Src, Int32VT);
3822 }
3823
3824 // Promote f16/bf16 src to f32
3825 if (SrcVT == MVT::f16 || SrcVT == MVT::bf16) {
3826 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3827 return DAG.getNode(Op.getOpcode(), DL, DstVT, PromotedSrc, SatVTOp);
3828 }
3829
3830 // Promote sub-i32 dst to i32 with sub-i32 saturation
3831 if (DstWidth < 32) {
3832 // Note: this triggers SatWidth < DstWidth above to generate saturated
3833 // truncate by requesting MVT::i32 destination with SatWidth < 32.
3834 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src, SatVTOp);
3835 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, FpToInt32);
3836 }
3837
3838 // TODO: can we implement i64 dst for f32/f64?
3839
3840 return SDValue();
3841}
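
// Illustrative sketch (editorial addition, not upstream code): the
// "saturate at i32, then truncate" step above, written out for a signed
// 8-bit saturation width. Assumes <cstdint> and <algorithm>.
static inline int8_t illustrativeSatI32ToI8(int32_t X) {
  int32_t Clamped = std::max(std::min(X, int32_t(127)), int32_t(-128));
  return static_cast<int8_t>(Clamped); // the truncation is now lossless
}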
3842
3843SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3844                                                     SelectionDAG &DAG) const {
3845 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3846 MVT VT = Op.getSimpleValueType();
3847 MVT ScalarVT = VT.getScalarType();
3848
3849 assert(VT.isVector());
3850
3851 SDValue Src = Op.getOperand(0);
3852 SDLoc DL(Op);
3853
3854 // TODO: Don't scalarize on Evergreen?
3855 unsigned NElts = VT.getVectorNumElements();
3856  SmallVector<SDValue, 8> Args;
3857  DAG.ExtractVectorElements(Src, Args, 0, NElts);
3858
3859 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3860 for (unsigned I = 0; I < NElts; ++I)
3861 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3862
3863 return DAG.getBuildVector(VT, DL, Args);
3864}
3865
3866//===----------------------------------------------------------------------===//
3867// Custom DAG optimizations
3868//===----------------------------------------------------------------------===//
3869
3870static bool isU24(SDValue Op, SelectionDAG &DAG) {
3871 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3872}
3873
3874static bool isI24(SDValue Op, SelectionDAG &DAG) {
3875 EVT VT = Op.getValueType();
3876 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3877 // as unsigned 24-bit values.
3878         AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3879}
3880
3883 SelectionDAG &DAG = DCI.DAG;
3884 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3885 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3886
3887 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3888 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3889 unsigned NewOpcode = Node24->getOpcode();
3890 if (IsIntrin) {
3891 unsigned IID = Node24->getConstantOperandVal(0);
3892 switch (IID) {
3893 case Intrinsic::amdgcn_mul_i24:
3894 NewOpcode = AMDGPUISD::MUL_I24;
3895 break;
3896 case Intrinsic::amdgcn_mul_u24:
3897 NewOpcode = AMDGPUISD::MUL_U24;
3898 break;
3899 case Intrinsic::amdgcn_mulhi_i24:
3900 NewOpcode = AMDGPUISD::MULHI_I24;
3901 break;
3902 case Intrinsic::amdgcn_mulhi_u24:
3903 NewOpcode = AMDGPUISD::MULHI_U24;
3904 break;
3905 default:
3906 llvm_unreachable("Expected 24-bit mul intrinsic");
3907 }
3908 }
3909
3910 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3911
3912 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3913 // the operands to have other uses, but will only perform simplifications that
3914 // involve bypassing some nodes for this user.
3915 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3916 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3917 if (DemandedLHS || DemandedRHS)
3918 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3919 DemandedLHS ? DemandedLHS : LHS,
3920 DemandedRHS ? DemandedRHS : RHS);
3921
3922 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3923 // operands if this node is the only user.
3924 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3925 return SDValue(Node24, 0);
3926 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3927 return SDValue(Node24, 0);
3928
3929 return SDValue();
3930}
3931
3932template <typename IntTy>
3933static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3934                               uint32_t Width, const SDLoc &DL) {
3935 if (Width + Offset < 32) {
3936 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3937 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3938 if constexpr (std::is_signed_v<IntTy>) {
3939 return DAG.getSignedConstant(Result, DL, MVT::i32);
3940 } else {
3941 return DAG.getConstant(Result, DL, MVT::i32);
3942 }
3943 }
3944
3945 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3946}
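
// Illustrative sketch (editorial addition, not upstream code): the shift pair
// used above extracts Width bits starting at Offset, sign- or zero-extending
// the field depending on the chosen integer type. Assumes <cstdint>.
// e.g. illustrativeBFE<int32_t>(0x00000F00, 8, 4) == -1 (sign-extended 0b1111).
template <typename IntTy>
static inline IntTy illustrativeBFE(uint32_t Src, uint32_t Offset,
                                    uint32_t Width) {
  uint32_t Shl = Src << (32 - Offset - Width); // assumes Width + Offset < 32
  return static_cast<IntTy>(Shl) >> (32 - Width);
}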
3947
3948static bool hasVolatileUser(SDNode *Val) {
3949 for (SDNode *U : Val->users()) {
3950 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3951 if (M->isVolatile())
3952 return true;
3953 }
3954 }
3955
3956 return false;
3957}
3958
3959bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3960  // i32 vectors are the canonical memory type.
3961 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3962 return false;
3963
3964 if (!VT.isByteSized())
3965 return false;
3966
3967 unsigned Size = VT.getStoreSize();
3968
3969 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3970 return false;
3971
3972 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3973 return false;
3974
3975 return true;
3976}
3977
3978// Replace load of an illegal type with a bitcast from a load of a friendlier
3979// type.
3980SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3981                                                 DAGCombinerInfo &DCI) const {
3982 if (!DCI.isBeforeLegalize())
3983 return SDValue();
3984
3985  LoadSDNode *LN = cast<LoadSDNode>(N);
3986  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3987 return SDValue();
3988
3989 SDLoc SL(N);
3990 SelectionDAG &DAG = DCI.DAG;
3991 EVT VT = LN->getMemoryVT();
3992
3993 unsigned Size = VT.getStoreSize();
3994 Align Alignment = LN->getAlign();
3995 if (Alignment < Size && isTypeLegal(VT)) {
3996 unsigned IsFast;
3997 unsigned AS = LN->getAddressSpace();
3998
3999 // Expand unaligned loads earlier than legalization. Due to visitation order
4000 // problems during legalization, the emitted instructions to pack and unpack
4001 // the bytes again are not eliminated in the case of an unaligned copy.
4002    if (!allowsMisalignedMemoryAccesses(
4003            VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
4004 if (VT.isVector())
4005 return SplitVectorLoad(SDValue(LN, 0), DAG);
4006
4007 SDValue Ops[2];
4008 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
4009
4010 return DAG.getMergeValues(Ops, SDLoc(N));
4011 }
4012
4013 if (!IsFast)
4014 return SDValue();
4015 }
4016
4017 if (!shouldCombineMemoryType(VT))
4018 return SDValue();
4019
4020 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4021
4022 SDValue NewLoad
4023 = DAG.getLoad(NewVT, SL, LN->getChain(),
4024 LN->getBasePtr(), LN->getMemOperand());
4025
4026 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
4027 DCI.CombineTo(N, BC, NewLoad.getValue(1));
4028 return SDValue(N, 0);
4029}
4030
4031// Replace store of an illegal type with a store of a bitcast to a friendlier
4032// type.
4033SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
4034                                                  DAGCombinerInfo &DCI) const {
4035 if (!DCI.isBeforeLegalize())
4036 return SDValue();
4037
4038  StoreSDNode *SN = cast<StoreSDNode>(N);
4039  if (!SN->isSimple() || !ISD::isNormalStore(SN))
4040 return SDValue();
4041
4042 EVT VT = SN->getMemoryVT();
4043 unsigned Size = VT.getStoreSize();
4044
4045 SDLoc SL(N);
4046 SelectionDAG &DAG = DCI.DAG;
4047 Align Alignment = SN->getAlign();
4048 if (Alignment < Size && isTypeLegal(VT)) {
4049 unsigned IsFast;
4050 unsigned AS = SN->getAddressSpace();
4051
4052 // Expand unaligned stores earlier than legalization. Due to visitation
4053 // order problems during legalization, the emitted instructions to pack and
4054 // unpack the bytes again are not eliminated in the case of an unaligned
4055 // copy.
4056    if (!allowsMisalignedMemoryAccesses(
4057            VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
4058 if (VT.isVector())
4059 return SplitVectorStore(SDValue(SN, 0), DAG);
4060
4061 return expandUnalignedStore(SN, DAG);
4062 }
4063
4064 if (!IsFast)
4065 return SDValue();
4066 }
4067
4068 if (!shouldCombineMemoryType(VT))
4069 return SDValue();
4070
4071 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4072 SDValue Val = SN->getValue();
4073
4074 //DCI.AddToWorklist(Val.getNode());
4075
4076 bool OtherUses = !Val.hasOneUse();
4077 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
4078 if (OtherUses) {
4079 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
4080 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
4081 }
4082
4083 return DAG.getStore(SN->getChain(), SL, CastVal,
4084 SN->getBasePtr(), SN->getMemOperand());
4085}
4086
4087// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4088// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4089// issues.
4090SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4091                                                        DAGCombinerInfo &DCI) const {
4092 SelectionDAG &DAG = DCI.DAG;
4093 SDValue N0 = N->getOperand(0);
4094
4095 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4096 // (vt2 (truncate (assertzext vt0:x, vt1)))
4097 if (N0.getOpcode() == ISD::TRUNCATE) {
4098 SDValue N1 = N->getOperand(1);
4099 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4100 SDLoc SL(N);
4101
4102 SDValue Src = N0.getOperand(0);
4103 EVT SrcVT = Src.getValueType();
4104 if (SrcVT.bitsGE(ExtVT)) {
4105 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4106 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4107 }
4108 }
4109
4110 return SDValue();
4111}
4112
4113SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
4114    SDNode *N, DAGCombinerInfo &DCI) const {
4115 unsigned IID = N->getConstantOperandVal(0);
4116 switch (IID) {
4117 case Intrinsic::amdgcn_mul_i24:
4118 case Intrinsic::amdgcn_mul_u24:
4119 case Intrinsic::amdgcn_mulhi_i24:
4120 case Intrinsic::amdgcn_mulhi_u24:
4121 return simplifyMul24(N, DCI);
4122 case Intrinsic::amdgcn_fract:
4123 case Intrinsic::amdgcn_rsq:
4124 case Intrinsic::amdgcn_rcp_legacy:
4125 case Intrinsic::amdgcn_rsq_legacy:
4126 case Intrinsic::amdgcn_rsq_clamp:
4127 case Intrinsic::amdgcn_tanh:
4128 case Intrinsic::amdgcn_prng_b32: {
4129 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4130 SDValue Src = N->getOperand(1);
4131 return Src.isUndef() ? Src : SDValue();
4132 }
4133 case Intrinsic::amdgcn_frexp_exp: {
4134 // frexp_exp (fneg x) -> frexp_exp x
4135 // frexp_exp (fabs x) -> frexp_exp x
4136 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4137 SDValue Src = N->getOperand(1);
4138 SDValue PeekSign = peekFPSignOps(Src);
4139 if (PeekSign == Src)
4140 return SDValue();
4141 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4142 0);
4143 }
4144 default:
4145 return SDValue();
4146 }
4147}
4148
4149/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4150/// binary operation \p Opc to it with the corresponding constant operands.
4151SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4152    DAGCombinerInfo &DCI, const SDLoc &SL,
4153 unsigned Opc, SDValue LHS,
4154 uint32_t ValLo, uint32_t ValHi) const {
4155 SelectionDAG &DAG = DCI.DAG;
4156 SDValue Lo, Hi;
4157 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4158
4159 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4160 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4161
4162 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4163 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4164
4165 // Re-visit the ands. It's possible we eliminated one of them and it could
4166 // simplify the vector.
4167 DCI.AddToWorklist(Lo.getNode());
4168 DCI.AddToWorklist(Hi.getNode());
4169
4170 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4171 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4172}
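
// Illustrative sketch (editorial addition, not upstream code): the split
// performed above, for a 64-bit AND against a constant, in plain C++.
// Assumes <cstdint>.
static inline uint64_t illustrativeSplitAnd64(uint64_t LHS, uint32_t ValLo,
                                              uint32_t ValHi) {
  uint32_t Lo = static_cast<uint32_t>(LHS) & ValLo;       // low 32-bit op
  uint32_t Hi = static_cast<uint32_t>(LHS >> 32) & ValHi; // high 32-bit op
  return (static_cast<uint64_t>(Hi) << 32) | Lo;          // rebuild the i64
}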
4173
4174SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4175                                                DAGCombinerInfo &DCI) const {
4176 EVT VT = N->getValueType(0);
4177 SDValue LHS = N->getOperand(0);
4178 SDValue RHS = N->getOperand(1);
4180 SDLoc SL(N);
4181 SelectionDAG &DAG = DCI.DAG;
4182
4183 unsigned RHSVal;
4184 if (CRHS) {
4185 RHSVal = CRHS->getZExtValue();
4186 if (!RHSVal)
4187 return LHS;
4188
4189 switch (LHS->getOpcode()) {
4190 default:
4191 break;
4192 case ISD::ZERO_EXTEND:
4193 case ISD::SIGN_EXTEND:
4194 case ISD::ANY_EXTEND: {
4195 SDValue X = LHS->getOperand(0);
4196
4197 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4198 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4199 // Prefer build_vector as the canonical form if packed types are legal.
4200        // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4201 SDValue Vec = DAG.getBuildVector(
4202 MVT::v2i16, SL,
4203 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4204 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4205 }
4206
4207 // shl (ext x) => zext (shl x), if shift does not overflow int
4208 if (VT != MVT::i64)
4209 break;
4210 KnownBits Known = DAG.computeKnownBits(X);
4211 unsigned LZ = Known.countMinLeadingZeros();
4212 if (LZ < RHSVal)
4213 break;
4214 EVT XVT = X.getValueType();
4215 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4216 return DAG.getZExtOrTrunc(Shl, SL, VT);
4217 }
4218 }
4219 }
4220
4221 if (VT.getScalarType() != MVT::i64)
4222 return SDValue();
4223
4224 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4225 // common case, splitting this into a move and a 32-bit shift is faster and
4226 // the same code size.
4227 KnownBits Known = DAG.computeKnownBits(RHS);
4228
4229 EVT ElementType = VT.getScalarType();
4230 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4231 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4232
4233 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4234 return SDValue();
4235 SDValue ShiftAmt;
4236
4237 if (CRHS) {
4238 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4239 TargetType);
4240 } else {
4241 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4242 const SDValue ShiftMask =
4243 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4244 // This AND instruction will clamp out of bounds shift values.
4245 // It will also be removed during later instruction selection.
4246 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4247 }
4248
4249 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4250 SDValue NewShift =
4251 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4252
4253 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4254 SDValue Vec;
4255
4256 if (VT.isVector()) {
4257 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4258 unsigned NElts = TargetType.getVectorNumElements();
4259    SmallVector<SDValue, 8> HiOps;
4260    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4261
4262 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4263 for (unsigned I = 0; I != NElts; ++I)
4264 HiAndLoOps[2 * I + 1] = HiOps[I];
4265 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4266 } else {
4267 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4268 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4269 }
4270 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4271}
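
// Illustrative sketch (editorial addition, not upstream code): for a shift
// amount known to be in [32, 63], the 64-bit left shift above reduces to a
// single 32-bit shift placed in the high half. Assumes <cstdint>.
// e.g. illustrativeShl64ByAtLeast32(0xDEADBEEF, 40) == 0xADBEEF0000000000.
static inline uint64_t illustrativeShl64ByAtLeast32(uint64_t X, unsigned Amt) {
  uint32_t Hi = static_cast<uint32_t>(X) << (Amt - 32); // only lo(X) survives
  return static_cast<uint64_t>(Hi) << 32;               // low half becomes 0
}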
4272
4273SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4274                                                DAGCombinerInfo &DCI) const {
4275 SDValue RHS = N->getOperand(1);
4277 EVT VT = N->getValueType(0);
4278 SDValue LHS = N->getOperand(0);
4279 SelectionDAG &DAG = DCI.DAG;
4280 SDLoc SL(N);
4281
4282 if (VT.getScalarType() != MVT::i64)
4283 return SDValue();
4284
4285 // For C >= 32
4286  // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))
4287
4288 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4289 // common case, splitting this into a move and a 32-bit shift is faster and
4290 // the same code size.
4291 KnownBits Known = DAG.computeKnownBits(RHS);
4292
4293 EVT ElementType = VT.getScalarType();
4294 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4295 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4296
4297 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4298 return SDValue();
4299
4300 SDValue ShiftFullAmt =
4301 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4302 SDValue ShiftAmt;
4303 if (CRHS) {
4304 unsigned RHSVal = CRHS->getZExtValue();
4305 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4306 TargetType);
4307 } else if (Known.getMinValue().getZExtValue() ==
4308 (ElementType.getSizeInBits() - 1)) {
4309 ShiftAmt = ShiftFullAmt;
4310 } else {
4311 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4312 const SDValue ShiftMask =
4313 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4314 // This AND instruction will clamp out of bounds shift values.
4315 // It will also be removed during later instruction selection.
4316 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4317 }
4318
4319 EVT ConcatType;
4320 SDValue Hi;
4321 SDLoc LHSSL(LHS);
4322 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4323 if (VT.isVector()) {
4324 unsigned NElts = TargetType.getVectorNumElements();
4325 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4326 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4327 SmallVector<SDValue, 8> HiOps(NElts);
4328 SmallVector<SDValue, 16> HiAndLoOps;
4329
4330 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4331 for (unsigned I = 0; I != NElts; ++I) {
4332 HiOps[I] = HiAndLoOps[2 * I + 1];
4333 }
4334 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4335 } else {
4336 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4337 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4338 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4339 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4340 }
4341
4342 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4343 SDValue HiShift;
4344 if (KnownLHS.isNegative()) {
4345 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4346 } else {
4347 Hi = DAG.getFreeze(Hi);
4348 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4349 }
4350 SDValue NewShift =
4351 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4352
4353 SDValue Vec;
4354 if (VT.isVector()) {
4355 unsigned NElts = TargetType.getVectorNumElements();
4356    SmallVector<SDValue, 8> HiOps;
4357    SmallVector<SDValue, 8> LoOps;
4358    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4359
4360 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4361 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4362 for (unsigned I = 0; I != NElts; ++I) {
4363 HiAndLoOps[2 * I + 1] = HiOps[I];
4364 HiAndLoOps[2 * I] = LoOps[I];
4365 }
4366 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4367 } else {
4368 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4369 }
4370 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4371}
4372
4373SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4374                                                DAGCombinerInfo &DCI) const {
4375 SDValue RHS = N->getOperand(1);
4377 EVT VT = N->getValueType(0);
4378 SDValue LHS = N->getOperand(0);
4379 SelectionDAG &DAG = DCI.DAG;
4380 SDLoc SL(N);
4381 unsigned RHSVal;
4382
4383 if (CRHS) {
4384 RHSVal = CRHS->getZExtValue();
4385
4386    // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
4387 // this improves the ability to match BFE patterns in isel.
4388 if (LHS.getOpcode() == ISD::AND) {
4389 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4390 unsigned MaskIdx, MaskLen;
4391 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4392 MaskIdx == RHSVal) {
4393 return DAG.getNode(ISD::AND, SL, VT,
4394 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4395 N->getOperand(1)),
4396 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4397 N->getOperand(1)));
4398 }
4399 }
4400 }
4401 }
4402
4403 if (VT.getScalarType() != MVT::i64)
4404 return SDValue();
4405
4406 // for C >= 32
4407 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4408
4409 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4410 // common case, splitting this into a move and a 32-bit shift is faster and
4411 // the same code size.
4412 KnownBits Known = DAG.computeKnownBits(RHS);
4413
4414 EVT ElementType = VT.getScalarType();
4415 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4416 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4417
4418 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4419 return SDValue();
4420
4421 SDValue ShiftAmt;
4422 if (CRHS) {
4423 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4424 TargetType);
4425 } else {
4426 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4427 const SDValue ShiftMask =
4428 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4429 // This AND instruction will clamp out of bounds shift values.
4430 // It will also be removed during later instruction selection.
4431 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4432 }
4433
4434 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4435 EVT ConcatType;
4436 SDValue Hi;
4437 SDLoc LHSSL(LHS);
4438 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4439 if (VT.isVector()) {
4440 unsigned NElts = TargetType.getVectorNumElements();
4441 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4442 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4443 SmallVector<SDValue, 8> HiOps(NElts);
4444 SmallVector<SDValue, 16> HiAndLoOps;
4445
4446 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4447 for (unsigned I = 0; I != NElts; ++I)
4448 HiOps[I] = HiAndLoOps[2 * I + 1];
4449 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4450 } else {
4451 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4452 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4453 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4454 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4455 }
4456
4457 SDValue NewShift =
4458 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4459
4460 SDValue Vec;
4461 if (VT.isVector()) {
4462 unsigned NElts = TargetType.getVectorNumElements();
4463    SmallVector<SDValue, 8> LoOps;
4464    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4465
4466 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4467 for (unsigned I = 0; I != NElts; ++I)
4468 HiAndLoOps[2 * I] = LoOps[I];
4469 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4470 } else {
4471 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4472 }
4473 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4474}
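
// Illustrative sketch (editorial addition, not upstream code): for a shift
// amount known to be in [32, 63], the 64-bit logical right shift above
// reduces to a 32-bit shift of the high half, with a zero high result.
// Assumes <cstdint>.
static inline uint64_t illustrativeSrl64ByAtLeast32(uint64_t X, unsigned Amt) {
  uint32_t Lo = static_cast<uint32_t>(X >> 32) >> (Amt - 32);
  return Lo; // the high 32 bits of the result are zero
}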
4475
4476SDValue AMDGPUTargetLowering::performTruncateCombine(
4477    SDNode *N, DAGCombinerInfo &DCI) const {
4478 SDLoc SL(N);
4479 SelectionDAG &DAG = DCI.DAG;
4480 EVT VT = N->getValueType(0);
4481 SDValue Src = N->getOperand(0);
4482
4483 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4484 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4485 SDValue Vec = Src.getOperand(0);
4486 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4487 SDValue Elt0 = Vec.getOperand(0);
4488 EVT EltVT = Elt0.getValueType();
4489 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4490 if (EltVT.isFloatingPoint()) {
4491 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4492 EltVT.changeTypeToInteger(), Elt0);
4493 }
4494
4495 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4496 }
4497 }
4498 }
4499
4500 // Equivalent of above for accessing the high element of a vector as an
4501 // integer operation.
4502 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4503 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4504 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4505 SDValue BV = stripBitcast(Src.getOperand(0));
4506 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4507 EVT SrcEltVT = BV.getOperand(0).getValueType();
4508 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4509 unsigned BitIndex = K->getZExtValue();
4510 unsigned PartIndex = BitIndex / SrcEltSize;
4511
4512 if (PartIndex * SrcEltSize == BitIndex &&
4513 PartIndex < BV.getNumOperands()) {
4514 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4515 SDValue SrcElt =
4516 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4517 BV.getOperand(PartIndex));
4518 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4519 }
4520 }
4521 }
4522 }
4523 }
4524
4525 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4526 //
4527 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4528 // i16 (trunc (srl (i32 (trunc x), K)))
4529 if (VT.getScalarSizeInBits() < 32) {
4530 EVT SrcVT = Src.getValueType();
4531 if (SrcVT.getScalarSizeInBits() > 32 &&
4532 (Src.getOpcode() == ISD::SRL ||
4533 Src.getOpcode() == ISD::SRA ||
4534 Src.getOpcode() == ISD::SHL)) {
4535 SDValue Amt = Src.getOperand(1);
4536 KnownBits Known = DAG.computeKnownBits(Amt);
4537
4538 // - For left shifts, do the transform as long as the shift
4539 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4540 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4541 // losing information stored in the high bits when truncating.
4542 const unsigned MaxCstSize =
4543 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4544 if (Known.getMaxValue().ule(MaxCstSize)) {
4545 EVT MidVT = VT.isVector() ?
4546 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4547 VT.getVectorNumElements()) : MVT::i32;
4548
4549 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4550 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4551 Src.getOperand(0));
4552 DCI.AddToWorklist(Trunc.getNode());
4553
4554 if (Amt.getValueType() != NewShiftVT) {
4555 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4556 DCI.AddToWorklist(Amt.getNode());
4557 }
4558
4559 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4560 Trunc, Amt);
4561 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4562 }
4563 }
4564 }
4565
4566 return SDValue();
4567}
4568
4569// We need to specifically handle i64 mul here to avoid unnecessary conversion
4570// instructions. If we only match on the legalized i64 mul expansion,
4571// SimplifyDemandedBits will be unable to remove them because there will be
4572// multiple uses due to the separate mul + mulh[su].
4573static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4574 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4575 if (Size <= 32) {
4576 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4577 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4578 }
4579
4580 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4581 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4582
4583 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4584 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4585
4586 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4587}
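
// Illustrative sketch (editorial addition, not upstream code): MUL_U24 and
// MULHI_U24 together form the full product of two 24-bit operands, which is
// what the low/high pair built above represents. Assumes <cstdint>.
static inline uint64_t illustrativeMul24Pair(uint32_t A, uint32_t B) {
  uint64_t Full = static_cast<uint64_t>(A & 0xffffff) * (B & 0xffffff);
  uint32_t MulLo = static_cast<uint32_t>(Full);       // what MUL_U24 produces
  uint32_t MulHi = static_cast<uint32_t>(Full >> 32); // what MULHI_U24 produces
  return (static_cast<uint64_t>(MulHi) << 32) | MulLo;
}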
4588
4589/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4590/// return SDValue().
4591static SDValue getAddOneOp(const SDNode *V) {
4592 if (V->getOpcode() != ISD::ADD)
4593 return SDValue();
4594
4595 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4596}
4597
4598SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4599                                                DAGCombinerInfo &DCI) const {
4600 assert(N->getOpcode() == ISD::MUL);
4601 EVT VT = N->getValueType(0);
4602
4603 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4604 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4605 // unnecessarily). isDivergent() is used as an approximation of whether the
4606 // value is in an SGPR.
4607 if (!N->isDivergent())
4608 return SDValue();
4609
4610 unsigned Size = VT.getSizeInBits();
4611 if (VT.isVector() || Size > 64)
4612 return SDValue();
4613
4614 SelectionDAG &DAG = DCI.DAG;
4615 SDLoc DL(N);
4616
4617 SDValue N0 = N->getOperand(0);
4618 SDValue N1 = N->getOperand(1);
4619
4620  // Undo InstCombine's canonicalization of X * (Y + 1) -> X * Y + X to enable
4621  // mad matching.
4622
4623 // mul x, (add y, 1) -> add (mul x, y), x
4624 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4625 SDValue AddOp = getAddOneOp(V.getNode());
4626 if (!AddOp)
4627 return SDValue();
4628
4629 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4630 return U->getOpcode() == ISD::MUL;
4631 }))
4632 return AddOp;
4633
4634 return SDValue();
4635 };
4636
4637 // FIXME: The selection pattern is not properly checking for commuted
4638 // operands, so we have to place the mul in the LHS
4639 if (SDValue MulOper = IsFoldableAdd(N0)) {
4640 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4641 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4642 }
4643
4644 if (SDValue MulOper = IsFoldableAdd(N1)) {
4645 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4646 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4647 }
4648
4649 // There are i16 integer mul/mad.
4650 if (isTypeLegal(MVT::i16) && VT.getScalarType().bitsLE(MVT::i16))
4651 return SDValue();
4652
4653 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4654 // in the source into any_extends if the result of the mul is truncated. Since
4655 // we can assume the high bits are whatever we want, use the underlying value
4656  // to keep the unknown high bits from interfering.
4657 if (N0.getOpcode() == ISD::ANY_EXTEND)
4658 N0 = N0.getOperand(0);
4659
4660 if (N1.getOpcode() == ISD::ANY_EXTEND)
4661 N1 = N1.getOperand(0);
4662
4663 SDValue Mul;
4664
4665 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4666 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4667 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4668 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4669 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4670 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4671 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4672 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4673 } else {
4674 return SDValue();
4675 }
4676
4677 // We need to use sext even for MUL_U24, because MUL_U24 is used
4678 // for signed multiply of 8 and 16-bit types.
4679 return DAG.getSExtOrTrunc(Mul, DL, VT);
4680}
4681
4682SDValue
4683AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4684                                            DAGCombinerInfo &DCI) const {
4685 if (N->getValueType(0) != MVT::i32)
4686 return SDValue();
4687
4688 SelectionDAG &DAG = DCI.DAG;
4689 SDLoc DL(N);
4690
4691 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4692 SDValue N0 = N->getOperand(0);
4693 SDValue N1 = N->getOperand(1);
4694
4695 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4696 // in the source into any_extends if the result of the mul is truncated. Since
4697 // we can assume the high bits are whatever we want, use the underlying value
4698  // to keep the unknown high bits from interfering.
4699 if (N0.getOpcode() == ISD::ANY_EXTEND)
4700 N0 = N0.getOperand(0);
4701 if (N1.getOpcode() == ISD::ANY_EXTEND)
4702 N1 = N1.getOperand(0);
4703
4704 // Try to use two fast 24-bit multiplies (one for each half of the result)
4705 // instead of one slow extending multiply.
4706 unsigned LoOpcode = 0;
4707 unsigned HiOpcode = 0;
4708 if (Signed) {
4709 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4710 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4711 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4712 LoOpcode = AMDGPUISD::MUL_I24;
4713 HiOpcode = AMDGPUISD::MULHI_I24;
4714 }
4715 } else {
4716 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4717 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4718 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4719 LoOpcode = AMDGPUISD::MUL_U24;
4720 HiOpcode = AMDGPUISD::MULHI_U24;
4721 }
4722 }
4723 if (!LoOpcode)
4724 return SDValue();
4725
4726 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4727 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4728 DCI.CombineTo(N, Lo, Hi);
4729 return SDValue(N, 0);
4730}
4731
4732SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4733                                                  DAGCombinerInfo &DCI) const {
4734 EVT VT = N->getValueType(0);
4735
4736 if (!Subtarget->hasMulI24() || VT.isVector())
4737 return SDValue();
4738
4739 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4740 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4741 // unnecessarily). isDivergent() is used as an approximation of whether the
4742 // value is in an SGPR.
4743 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4744 // valu op anyway)
4745 if (Subtarget->hasSMulHi() && !N->isDivergent())
4746 return SDValue();
4747
4748 SelectionDAG &DAG = DCI.DAG;
4749 SDLoc DL(N);
4750
4751 SDValue N0 = N->getOperand(0);
4752 SDValue N1 = N->getOperand(1);
4753
4754 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4755 return SDValue();
4756
4757 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4758 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4759
4760 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4761 DCI.AddToWorklist(Mulhi.getNode());
4762 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4763}
4764
4765SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4766                                                  DAGCombinerInfo &DCI) const {
4767 EVT VT = N->getValueType(0);
4768
4769 if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
4770 return SDValue();
4771
4772 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4773 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4774 // unnecessarily). isDivergent() is used as an approximation of whether the
4775 // value is in an SGPR.
4776 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4777 // valu op anyway)
4778 if (!N->isDivergent() && Subtarget->hasSMulHi())
4779 return SDValue();
4780
4781 SelectionDAG &DAG = DCI.DAG;
4782 SDLoc DL(N);
4783
4784 SDValue N0 = N->getOperand(0);
4785 SDValue N1 = N->getOperand(1);
4786
4787 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4788 return SDValue();
4789
4790 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4791 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4792
4793 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4794 DCI.AddToWorklist(Mulhi.getNode());
4795 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4796}
4797
4798SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4799 SDValue Op,
4800 const SDLoc &DL,
4801 unsigned Opc) const {
4802 EVT VT = Op.getValueType();
4803 if (VT.bitsGT(MVT::i32))
4804 return SDValue();
4805
4806 if (VT != MVT::i32)
4807 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4808
4809 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4810 if (VT != MVT::i32)
4811 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4812
4813 return FFBX;
4814}
4815
4816// The native instructions return -1 on 0 input. Optimize out a select that
4817// produces -1 on 0.
4818//
4819// TODO: If zero is not undef, we could also do this if the output is compared
4820// against the bitwidth.
4821//
4822// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4823SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4824                                                      SDValue LHS, SDValue RHS,
4825 DAGCombinerInfo &DCI) const {
4826 if (!isNullConstant(Cond.getOperand(1)))
4827 return SDValue();
4828
4829 SelectionDAG &DAG = DCI.DAG;
4830 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4831 SDValue CmpLHS = Cond.getOperand(0);
4832
4833 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4834 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4835 if (CCOpcode == ISD::SETEQ &&
4836 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4837 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4838 unsigned Opc =
4839 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4840 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4841 }
4842
4843 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4844 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4845 if (CCOpcode == ISD::SETNE &&
4846 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4847 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4848 unsigned Opc =
4849 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4850
4851 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4852 }
4853
4854 return SDValue();
4855}
4856
4857static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4858                                         unsigned Op,
4859 const SDLoc &SL,
4860 SDValue Cond,
4861 SDValue N1,
4862 SDValue N2) {
4863 SelectionDAG &DAG = DCI.DAG;
4864 EVT VT = N1.getValueType();
4865
4866 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4867 N1.getOperand(0), N2.getOperand(0));
4868 DCI.AddToWorklist(NewSelect.getNode());
4869 return DAG.getNode(Op, SL, VT, NewSelect);
4870}
4871
4872// Pull a free FP operation out of a select so it may fold into uses.
4873//
4874// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4875// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4876//
4877// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4878// select c, (fabs x), +k -> fabs (select c, x, k)
4879SDValue
4880AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4881                                           SDValue N) const {
4882 SelectionDAG &DAG = DCI.DAG;
4883 SDValue Cond = N.getOperand(0);
4884 SDValue LHS = N.getOperand(1);
4885 SDValue RHS = N.getOperand(2);
4886
4887 EVT VT = N.getValueType();
4888 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4889 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4891 return SDValue();
4892
4893 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4894 SDLoc(N), Cond, LHS, RHS);
4895 }
4896
4897 bool Inv = false;
4898 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4899 std::swap(LHS, RHS);
4900 Inv = true;
4901 }
4902
4903 // TODO: Support vector constants.
4904  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4905  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4906 !selectSupportsSourceMods(N.getNode())) {
4907 SDLoc SL(N);
4908 // If one side is an fneg/fabs and the other is a constant, we can push the
4909 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4910 SDValue NewLHS = LHS.getOperand(0);
4911 SDValue NewRHS = RHS;
4912
4913 // Careful: if the neg can be folded up, don't try to pull it back down.
4914 bool ShouldFoldNeg = true;
4915
4916 if (NewLHS.hasOneUse()) {
4917 unsigned Opc = NewLHS.getOpcode();
4918 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4919 ShouldFoldNeg = false;
4920 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4921 ShouldFoldNeg = false;
4922 }
4923
4924 if (ShouldFoldNeg) {
4925 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4926 return SDValue();
4927
4928      // We're going to be forced to use a source modifier anyway, so there's
4929      // no point in pulling the negate out unless we can get a size reduction
4930      // by negating the constant.
4931 //
4932 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4933 // about cheaper constants.
4934 if (NewLHS.getOpcode() == ISD::FABS &&
4936 return SDValue();
4937
4939 return SDValue();
4940
4941 if (LHS.getOpcode() == ISD::FNEG)
4942 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4943
4944 if (Inv)
4945 std::swap(NewLHS, NewRHS);
4946
4947 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4948 Cond, NewLHS, NewRHS);
4949 DCI.AddToWorklist(NewSelect.getNode());
4950 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4951 }
4952 }
4953
4954 return SDValue();
4955}
4956
4957SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4958                                                   DAGCombinerInfo &DCI) const {
4959 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4960 return Folded;
4961
4962 SDValue Cond = N->getOperand(0);
4963 if (Cond.getOpcode() != ISD::SETCC)
4964 return SDValue();
4965
4966 EVT VT = N->getValueType(0);
4967 SDValue LHS = Cond.getOperand(0);
4968 SDValue RHS = Cond.getOperand(1);
4969 SDValue CC = Cond.getOperand(2);
4970
4971 SDValue True = N->getOperand(1);
4972 SDValue False = N->getOperand(2);
4973
4974 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4975 SelectionDAG &DAG = DCI.DAG;
4976 if (DAG.isConstantValueOfAnyType(True) &&
4977 !DAG.isConstantValueOfAnyType(False)) {
4978 // Swap cmp + select pair to move constant to false input.
4979 // This will allow using VOPC cndmasks more often.
4980 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4981
4982 SDLoc SL(N);
4983 ISD::CondCode NewCC =
4984 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4985
4986 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4987 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4988 }
4989
4990 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4991      SDValue MinMax
4992        = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4993 // Revisit this node so we can catch min3/max3/med3 patterns.
4994 //DCI.AddToWorklist(MinMax.getNode());
4995 return MinMax;
4996 }
4997 }
4998
4999 // There's no reason to not do this if the condition has other uses.
5000 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
5001}
5002
5003static bool isInv2Pi(const APFloat &APF) {
5004 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5005 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5006 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5007
5008 return APF.bitwiseIsEqual(KF16) ||
5009 APF.bitwiseIsEqual(KF32) ||
5010 APF.bitwiseIsEqual(KF64);
5011}
5012
5013// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
5014// additional cost to negate them.
5017 if (C->isZero())
5018 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5019
5020 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
5021 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5022
5024}
5025
5031
5037
5038static unsigned inverseMinMax(unsigned Opc) {
5039 switch (Opc) {
5040 case ISD::FMAXNUM:
5041 return ISD::FMINNUM;
5042 case ISD::FMINNUM:
5043 return ISD::FMAXNUM;
5044 case ISD::FMAXNUM_IEEE:
5045 return ISD::FMINNUM_IEEE;
5046 case ISD::FMINNUM_IEEE:
5047 return ISD::FMAXNUM_IEEE;
5048 case ISD::FMAXIMUM:
5049 return ISD::FMINIMUM;
5050 case ISD::FMINIMUM:
5051 return ISD::FMAXIMUM;
5052 case ISD::FMAXIMUMNUM:
5053 return ISD::FMINIMUMNUM;
5054 case ISD::FMINIMUMNUM:
5055 return ISD::FMAXIMUMNUM;
5056 case AMDGPUISD::FMAX_LEGACY:
5057 return AMDGPUISD::FMIN_LEGACY;
5058 case AMDGPUISD::FMIN_LEGACY:
5059 return AMDGPUISD::FMAX_LEGACY;
5060 default:
5061 llvm_unreachable("invalid min/max opcode");
5062 }
5063}
5064
5065/// \return true if it's profitable to try to push an fneg into its source
5066/// instruction.
5067static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
5068  // If the input has multiple uses and we can either fold the negate down, or
5069 // the other uses cannot, give up. This both prevents unprofitable
5070 // transformations and infinite loops: we won't repeatedly try to fold around
5071 // a negate that has no 'good' form.
5072 if (N0.hasOneUse()) {
5073 // This may be able to fold into the source, but at a code size cost. Don't
5074 // fold if the fold into the user is free.
5075 if (allUsesHaveSourceMods(N, 0))
5076 return false;
5077 } else {
5078 if (fnegFoldsIntoOp(N0.getNode()) &&
5080 return false;
5081 }
5082
5083 return true;
5084}
5085
5086SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
5087                                                 DAGCombinerInfo &DCI) const {
5088 SelectionDAG &DAG = DCI.DAG;
5089 SDValue N0 = N->getOperand(0);
5090 EVT VT = N->getValueType(0);
5091
5092 unsigned Opc = N0.getOpcode();
5093
5094 if (!shouldFoldFNegIntoSrc(N, N0))
5095 return SDValue();
5096
5097 SDLoc SL(N);
5098 switch (Opc) {
5099 case ISD::FADD: {
5100 if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
5101 return SDValue();
5102
5103 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5104 SDValue LHS = N0.getOperand(0);
5105 SDValue RHS = N0.getOperand(1);
5106
5107 if (LHS.getOpcode() != ISD::FNEG)
5108 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5109 else
5110 LHS = LHS.getOperand(0);
5111
5112 if (RHS.getOpcode() != ISD::FNEG)
5113 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5114 else
5115 RHS = RHS.getOperand(0);
5116
5117 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5118 if (Res.getOpcode() != ISD::FADD)
5119 return SDValue(); // Op got folded away.
5120 if (!N0.hasOneUse())
5121 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5122 return Res;
5123 }
5124 case ISD::FMUL:
5125 case AMDGPUISD::FMUL_LEGACY: {
5126 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5127 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5128 SDValue LHS = N0.getOperand(0);
5129 SDValue RHS = N0.getOperand(1);
5130
5131 if (LHS.getOpcode() == ISD::FNEG)
5132 LHS = LHS.getOperand(0);
5133 else if (RHS.getOpcode() == ISD::FNEG)
5134 RHS = RHS.getOperand(0);
5135 else
5136 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5137
5138 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5139 if (Res.getOpcode() != Opc)
5140 return SDValue(); // Op got folded away.
5141 if (!N0.hasOneUse())
5142 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5143 return Res;
5144 }
5145 case ISD::FMA:
5146 case ISD::FMAD: {
5147 // TODO: handle llvm.amdgcn.fma.legacy
5148 if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
5149 return SDValue();
5150
5151 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5152 SDValue LHS = N0.getOperand(0);
5153 SDValue MHS = N0.getOperand(1);
5154 SDValue RHS = N0.getOperand(2);
5155
5156 if (LHS.getOpcode() == ISD::FNEG)
5157 LHS = LHS.getOperand(0);
5158 else if (MHS.getOpcode() == ISD::FNEG)
5159 MHS = MHS.getOperand(0);
5160 else
5161 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5162
5163 if (RHS.getOpcode() != ISD::FNEG)
5164 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5165 else
5166 RHS = RHS.getOperand(0);
5167
5168 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5169 if (Res.getOpcode() != Opc)
5170 return SDValue(); // Op got folded away.
5171 if (!N0.hasOneUse())
5172 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5173 return Res;
5174 }
5175 case ISD::FMAXNUM:
5176 case ISD::FMINNUM:
5177 case ISD::FMAXNUM_IEEE:
5178 case ISD::FMINNUM_IEEE:
5179 case ISD::FMINIMUM:
5180 case ISD::FMAXIMUM:
5181 case ISD::FMINIMUMNUM:
5182 case ISD::FMAXIMUMNUM:
5183 case AMDGPUISD::FMAX_LEGACY:
5184 case AMDGPUISD::FMIN_LEGACY: {
5185 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5186 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5187 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5188 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5189
5190 SDValue LHS = N0.getOperand(0);
5191 SDValue RHS = N0.getOperand(1);
5192
5193 // 0 doesn't have a negated inline immediate.
5194 // TODO: This constant check should be generalized to other operations.
5195 if (isConstantCostlierToNegate(RHS))
5196 return SDValue();
5197
5198 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5199 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5200 unsigned Opposite = inverseMinMax(Opc);
5201
5202 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5203 if (Res.getOpcode() != Opposite)
5204 return SDValue(); // Op got folded away.
5205 if (!N0.hasOneUse())
5206 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5207 return Res;
5208 }
5209 case AMDGPUISD::FMED3: {
5210 SDValue Ops[3];
5211 for (unsigned I = 0; I < 3; ++I)
5212 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5213
5214 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5215 if (Res.getOpcode() != AMDGPUISD::FMED3)
5216 return SDValue(); // Op got folded away.
5217
5218 if (!N0.hasOneUse()) {
5219 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5220 DAG.ReplaceAllUsesWith(N0, Neg);
5221
5222 for (SDNode *U : Neg->users())
5223 DCI.AddToWorklist(U);
5224 }
5225
5226 return Res;
5227 }
5228 case ISD::FP_EXTEND:
5229 case ISD::FTRUNC:
5230 case ISD::FRINT:
5231 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5232 case ISD::FROUNDEVEN:
5233 case ISD::FSIN:
5234 case ISD::FCANONICALIZE:
5235 case AMDGPUISD::RCP:
5236 case AMDGPUISD::RCP_LEGACY:
5237 case AMDGPUISD::RCP_IFLAG:
5238 case AMDGPUISD::SIN_HW: {
5239 SDValue CvtSrc = N0.getOperand(0);
5240 if (CvtSrc.getOpcode() == ISD::FNEG) {
5241 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5242 // (fneg (rcp (fneg x))) -> (rcp x)
5243 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5244 }
5245
5246 if (!N0.hasOneUse())
5247 return SDValue();
5248
5249 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5250 // (fneg (rcp x)) -> (rcp (fneg x))
5251 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5252 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5253 }
5254 case ISD::FP_ROUND: {
5255 SDValue CvtSrc = N0.getOperand(0);
5256
5257 if (CvtSrc.getOpcode() == ISD::FNEG) {
5258 // (fneg (fp_round (fneg x))) -> (fp_round x)
5259 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5260 CvtSrc.getOperand(0), N0.getOperand(1));
5261 }
5262
5263 if (!N0.hasOneUse())
5264 return SDValue();
5265
5266 // (fneg (fp_round x)) -> (fp_round (fneg x))
5267 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5268 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5269 }
5270 case ISD::FP16_TO_FP: {
5271 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5272 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5273 // Put the fneg back as a legal source operation that can be matched later.
5274 SDLoc SL(N);
5275
5276 SDValue Src = N0.getOperand(0);
5277 EVT SrcVT = Src.getValueType();
5278
5279 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
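// e.g. the i16 encoding of 1.0 is 0x3C00, and 0x3C00 ^ 0x8000 = 0xBC00, the
// encoding of -1.0: flipping bit 15 negates the value without a legal f16 fneg.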
5280 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5281 DAG.getConstant(0x8000, SL, SrcVT));
5282 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5283 }
5284 case ISD::SELECT: {
5285 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5286 // TODO: Invert conditions of foldFreeOpFromSelect
5287 return SDValue();
5288 }
5289 case ISD::BITCAST: {
5290 SDLoc SL(N);
5291 SDValue BCSrc = N0.getOperand(0);
5292 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5293 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5294 if (HighBits.getValueType().getSizeInBits() != 32 ||
5295 !fnegFoldsIntoOp(HighBits.getNode()))
5296 return SDValue();
5297
5298 // f64 fneg only really needs to operate on the high half of the
5299 // register, so try to force it to an f32 operation to help make use of
5300 // source modifiers.
5301 //
5302 //
5303 // fneg (f64 (bitcast (build_vector x, y))) ->
5304 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5305 // (fneg (bitcast i32:y to f32))))
5306
5307 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5308 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5309 SDValue CastBack =
5310 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5311
5312 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5313 Ops.back() = CastBack;
5314 DCI.AddToWorklist(NegHi.getNode());
5315 SDValue Build =
5316 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5317 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5318
5319 if (!N0.hasOneUse())
5320 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5321 return Result;
5322 }
5323
5324 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5325 BCSrc.hasOneUse()) {
5326 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5327 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5328
5329 // TODO: Cast back result for multiple uses is beneficial in some cases.
5330
5331 SDValue LHS =
5332 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5333 SDValue RHS =
5334 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5335
5336 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5337 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5338
5339 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5340 NegRHS);
5341 }
5342
5343 return SDValue();
5344 }
5345 default:
5346 return SDValue();
5347 }
5348}
5349
5350SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5351 DAGCombinerInfo &DCI) const {
5352 SelectionDAG &DAG = DCI.DAG;
5353 SDValue N0 = N->getOperand(0);
5354
5355 if (!N0.hasOneUse())
5356 return SDValue();
5357
5358 switch (N0.getOpcode()) {
5359 case ISD::FP16_TO_FP: {
5360 assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
5361 SDLoc SL(N);
5362 SDValue Src = N0.getOperand(0);
5363 EVT SrcVT = Src.getValueType();
5364
5365 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
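// e.g. 0xBC00 (-1.0 as an i16 half) & 0x7fff = 0x3C00 (+1.0): clearing bit 15
// computes the absolute value directly on the integer source.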
5366 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5367 DAG.getConstant(0x7fff, SL, SrcVT));
5368 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5369 }
5370 default:
5371 return SDValue();
5372 }
5373}
5374
5375SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5376 DAGCombinerInfo &DCI) const {
5377 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5378 if (!CFP)
5379 return SDValue();
5380
5381 // XXX - Should this flush denormals?
5382 const APFloat &Val = CFP->getValueAPF();
5383 APFloat One(Val.getSemantics(), "1.0");
5384 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5385}
5386
5387SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5388 DAGCombinerInfo &DCI) const {
5389 SelectionDAG &DAG = DCI.DAG;
5390 SDLoc DL(N);
5391
5392 switch (N->getOpcode()) {
5393 default:
5394 break;
5395 case ISD::BITCAST: {
5396 EVT DestVT = N->getValueType(0);
5397
5398 // Push casts through vector builds. This helps avoid emitting a large
5399 // number of copies when materializing floating point vector constants.
5400 //
5401 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5402 // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5403 if (DestVT.isVector()) {
5404 SDValue Src = N->getOperand(0);
5405 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5406 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5407 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5408 EVT SrcVT = Src.getValueType();
5409 unsigned NElts = DestVT.getVectorNumElements();
5410
5411 if (SrcVT.getVectorNumElements() == NElts) {
5412 EVT DestEltVT = DestVT.getVectorElementType();
5413
5414 SmallVector<SDValue, 8> CastedElts;
5415 SDLoc SL(N);
5416 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5417 SDValue Elt = Src.getOperand(I);
5418 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5419 }
5420
5421 return DAG.getBuildVector(DestVT, SL, CastedElts);
5422 }
5423 }
5424 }
5425
5426 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5427 break;
5428
5429 // Fold bitcasts of constants.
5430 //
5431 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
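// e.g. k = 0x0000000100000002 becomes build_vector (i32 0x2), (i32 0x1), with
// element 0 holding the low half.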
5432 // TODO: Generalize and move to DAGCombiner
5433 SDValue Src = N->getOperand(0);
5434 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5435 SDLoc SL(N);
5436 uint64_t CVal = C->getZExtValue();
5437 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5438 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5439 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5440 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5441 }
5442
5443 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5444 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5445 SDLoc SL(N);
5446 uint64_t CVal = Val.getZExtValue();
5447 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5448 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5449 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5450
5451 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5452 }
5453
5454 break;
5455 }
5456 case ISD::SHL:
5457 case ISD::SRA:
5458 case ISD::SRL: {
5459 // Range metadata can be invalidated when loads are converted to legal types
5460 // (e.g. v2i64 -> v4i32).
5461 // Try to convert vector shl/sra/srl before type legalization so that range
5462 // metadata can be utilized.
5463 if (!(N->getValueType(0).isVector() &&
5466 break;
5467 if (N->getOpcode() == ISD::SHL)
5468 return performShlCombine(N, DCI);
5469 if (N->getOpcode() == ISD::SRA)
5470 return performSraCombine(N, DCI);
5471 return performSrlCombine(N, DCI);
5472 }
5473 case ISD::TRUNCATE:
5474 return performTruncateCombine(N, DCI);
5475 case ISD::MUL:
5476 return performMulCombine(N, DCI);
5477 case AMDGPUISD::MUL_U24:
5478 case AMDGPUISD::MUL_I24: {
5479 if (SDValue Simplified = simplifyMul24(N, DCI))
5480 return Simplified;
5481 break;
5482 }
5483 case AMDGPUISD::MULHI_I24:
5484 case AMDGPUISD::MULHI_U24:
5485 return simplifyMul24(N, DCI);
5486 case ISD::SMUL_LOHI:
5487 case ISD::UMUL_LOHI:
5488 return performMulLoHiCombine(N, DCI);
5489 case ISD::MULHS:
5490 return performMulhsCombine(N, DCI);
5491 case ISD::MULHU:
5492 return performMulhuCombine(N, DCI);
5493 case ISD::SELECT:
5494 return performSelectCombine(N, DCI);
5495 case ISD::FNEG:
5496 return performFNegCombine(N, DCI);
5497 case ISD::FABS:
5498 return performFAbsCombine(N, DCI);
5499 case AMDGPUISD::BFE_I32:
5500 case AMDGPUISD::BFE_U32: {
5501 assert(!N->getValueType(0).isVector() &&
5502 "Vector handling of BFE not implemented");
5503 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5504 if (!Width)
5505 break;
5506
5507 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5508 if (WidthVal == 0)
5509 return DAG.getConstant(0, DL, MVT::i32);
5510
5511 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5512 if (!Offset)
5513 break;
5514
5515 SDValue BitsFrom = N->getOperand(0);
5516 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5517
5518 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5519
5520 if (OffsetVal == 0) {
5521 // This is already sign / zero extended, so try to fold away extra BFEs.
5522 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5523
5524 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5525 if (OpSignBits >= SignBits)
5526 return BitsFrom;
5527
5528 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5529 if (Signed) {
5530 // This is a sign_extend_inreg. Replace it to take advantage of existing
5531 // DAG Combines. If not eliminated, we will match back to BFE during
5532 // selection.
5533
5534 // TODO: The sext_inreg of extended types ends up here, although we could
5535 // handle them in a single BFE.
5536 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5537 DAG.getValueType(SmallVT));
5538 }
5539
5540 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5541 }
5542
5543 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5544 if (Signed) {
5545 return constantFoldBFE<int32_t>(DAG,
5546 CVal->getSExtValue(),
5547 OffsetVal,
5548 WidthVal,
5549 DL);
5550 }
5551
5552 return constantFoldBFE<uint32_t>(DAG,
5553 CVal->getZExtValue(),
5554 OffsetVal,
5555 WidthVal,
5556 DL);
5557 }
5558
5559 if ((OffsetVal + WidthVal) >= 32 &&
5560 !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
5561 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5562 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5563 BitsFrom, ShiftVal);
5564 }
5565
5566 if (BitsFrom.hasOneUse()) {
5567 APInt Demanded = APInt::getBitsSet(32,
5568 OffsetVal,
5569 OffsetVal + WidthVal);
5570
5571 KnownBits Known;
5572 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5573 !DCI.isBeforeLegalizeOps());
5574 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5575 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5576 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5577 DCI.CommitTargetLoweringOpt(TLO);
5578 }
5579 }
5580
5581 break;
5582 }
5583 case ISD::LOAD:
5584 return performLoadCombine(N, DCI);
5585 case ISD::STORE:
5586 return performStoreCombine(N, DCI);
5587 case AMDGPUISD::RCP:
5588 case AMDGPUISD::RCP_IFLAG:
5589 return performRcpCombine(N, DCI);
5590 case ISD::AssertZext:
5591 case ISD::AssertSext:
5592 return performAssertSZExtCombine(N, DCI);
5593 case ISD::INTRINSIC_WO_CHAIN:
5594 return performIntrinsicWOChainCombine(N, DCI);
5595 case AMDGPUISD::FMAD_FTZ: {
5596 SDValue N0 = N->getOperand(0);
5597 SDValue N1 = N->getOperand(1);
5598 SDValue N2 = N->getOperand(2);
5599 EVT VT = N->getValueType(0);
5600
5601 // FMAD_FTZ is a FMAD + flush denormals to zero.
5602 // We flush the inputs, the intermediate step, and the output.
5603 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5604 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5605 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5606 if (N0CFP && N1CFP && N2CFP) {
5607 const auto FTZ = [](const APFloat &V) {
5608 if (V.isDenormal()) {
5609 APFloat Zero(V.getSemantics(), 0);
5610 return V.isNegative() ? -Zero : Zero;
5611 }
5612 return V;
5613 };
5614
5615 APFloat V0 = FTZ(N0CFP->getValueAPF());
5616 APFloat V1 = FTZ(N1CFP->getValueAPF());
5617 APFloat V2 = FTZ(N2CFP->getValueAPF());
5618 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5619 V0 = FTZ(V0);
5620 V0.add(V2, APFloat::rmNearestTiesToEven);
5621 return DAG.getConstantFP(FTZ(V0), DL, VT);
5622 }
5623 break;
5624 }
5625 }
5626 return SDValue();
5627}
5628
5629//===----------------------------------------------------------------------===//
5630// Helper functions
5631//===----------------------------------------------------------------------===//
5632
5633SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5634 const TargetRegisterClass *RC,
5635 Register Reg, EVT VT,
5636 const SDLoc &SL,
5637 bool RawReg) const {
5638 MachineFunction &MF = DAG.getMachineFunction();
5639 MachineRegisterInfo &MRI = MF.getRegInfo();
5640 Register VReg;
5641
5642 if (!MRI.isLiveIn(Reg)) {
5643 VReg = MRI.createVirtualRegister(RC);
5644 MRI.addLiveIn(Reg, VReg);
5645 } else {
5646 VReg = MRI.getLiveInVirtReg(Reg);
5647 }
5648
5649 if (RawReg)
5650 return DAG.getRegister(VReg, VT);
5651
5652 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5653}
5654
5655// This may be called multiple times, and nothing prevents creating multiple
5656// objects at the same offset. See if we already defined this object.
5657static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5658 int64_t Offset) {
5659 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5660 if (MFI.getObjectOffset(I) == Offset) {
5661 assert(MFI.getObjectSize(I) == Size);
5662 return I;
5663 }
5664 }
5665
5666 return MFI.CreateFixedObject(Size, Offset, true);
5667}
5668
5669SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5670 EVT VT,
5671 const SDLoc &SL,
5672 int64_t Offset) const {
5673 MachineFunction &MF = DAG.getMachineFunction();
5674 MachineFrameInfo &MFI = MF.getFrameInfo();
5675 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5676
5677 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5678 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5679
5680 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5681 MachineMemOperand::MODereferenceable |
5682 MachineMemOperand::MOInvariant);
5683}
5684
5685SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5686 const SDLoc &SL,
5687 SDValue Chain,
5688 SDValue ArgVal,
5689 int64_t Offset) const {
5690 MachineFunction &MF = DAG.getMachineFunction();
5691 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5692 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5693
5694 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5695 // Stores to the argument stack area are relative to the stack pointer.
5696 SDValue SP =
5697 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5698 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5699 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5700 MachineMemOperand::MODereferenceable);
5701 return Store;
5702}
5703
5704SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5705 const TargetRegisterClass *RC,
5706 EVT VT, const SDLoc &SL,
5707 const ArgDescriptor &Arg) const {
5708 assert(Arg && "Attempting to load missing argument");
5709
5710 SDValue V = Arg.isRegister() ?
5711 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5712 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5713
5714 if (!Arg.isMasked())
5715 return V;
5716
5717 unsigned Mask = Arg.getMask();
5718 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5719 V = DAG.getNode(ISD::SRL, SL, VT, V,
5720 DAG.getShiftAmountConstant(Shift, VT, SL));
5721 return DAG.getNode(ISD::AND, SL, VT, V,
5722 DAG.getConstant(Mask >> Shift, SL, VT));
5723}
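// Example of the masked path: a workitem id packed into bits [29:20] of an
// input register (hypothetical layout) has Mask = 0x3ff00000; countr_zero
// gives Shift = 20 and the value is recovered as (V >> 20) & 0x3ff.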
5724
5725uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5726 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5727 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5728 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5729 uint64_t ArgOffset =
5730 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5731 switch (Param) {
5732 case FIRST_IMPLICIT:
5733 return ArgOffset;
5734 case PRIVATE_BASE:
5735 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5736 case SHARED_BASE:
5737 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5738 case QUEUE_PTR:
5739 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5740 }
5741 llvm_unreachable("unexpected implicit parameter type");
5742}
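// Example: with ExplicitKernArgSize = 20 and an implicit-arg alignment of 8,
// FIRST_IMPLICIT lands at ExplicitArgOffset + 24; the remaining parameters sit
// at fixed byte offsets (the *_OFFSET constants) past that base.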
5743
5749
5750SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5751 SelectionDAG &DAG, int Enabled,
5752 int &RefinementSteps,
5753 bool &UseOneConstNR,
5754 bool Reciprocal) const {
5755 EVT VT = Operand.getValueType();
5756
5757 if (VT == MVT::f32) {
5758 RefinementSteps = 0;
5759 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5760 }
5761
5762 // TODO: There is also an f64 rsq instruction, but the documentation is less
5763 // clear on its precision.
5764
5765 return SDValue();
5766}
5767
5768SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5769 SelectionDAG &DAG, int Enabled,
5770 int &RefinementSteps) const {
5771 EVT VT = Operand.getValueType();
5772
5773 if (VT == MVT::f32) {
5774 // Reciprocal, < 1 ulp error.
5775 //
5776 // This reciprocal approximation converges to < 0.5 ulp error with one
5777 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
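// A single refinement step of that form would be:
//   e = fma(-x, r0, 1.0); r1 = fma(r0, e, r0)
// where r0 is the raw RCP result; with RefinementSteps = 0 the raw result is
// used as-is.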
5778
5779 RefinementSteps = 0;
5780 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5781 }
5782
5783 // TODO: There is also an f64 rcp instruction, but the documentation is less
5784 // clear on its precision.
5785
5786 return SDValue();
5787}
5788
5789static unsigned workitemIntrinsicDim(unsigned ID) {
5790 switch (ID) {
5791 case Intrinsic::amdgcn_workitem_id_x:
5792 return 0;
5793 case Intrinsic::amdgcn_workitem_id_y:
5794 return 1;
5795 case Intrinsic::amdgcn_workitem_id_z:
5796 return 2;
5797 default:
5798 llvm_unreachable("not a workitem intrinsic");
5799 }
5800}
5801
5802void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5803 const SDValue Op, KnownBits &Known,
5804 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5805
5806 Known.resetAll(); // Don't know anything.
5807
5808 unsigned Opc = Op.getOpcode();
5809
5810 switch (Opc) {
5811 default:
5812 break;
5813 case AMDGPUISD::CARRY:
5814 case AMDGPUISD::BORROW: {
5815 Known.Zero = APInt::getHighBitsSet(32, 31);
5816 break;
5817 }
5818
5819 case AMDGPUISD::BFE_I32:
5820 case AMDGPUISD::BFE_U32: {
5821 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5822 if (!CWidth)
5823 return;
5824
5825 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5826
5827 if (Opc == AMDGPUISD::BFE_U32)
5828 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5829
5830 break;
5831 }
5832 case AMDGPUISD::FP_TO_FP16: {
5833 unsigned BitWidth = Known.getBitWidth();
5834
5835 // High bits are zero.
5836 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5837 break;
5838 }
5839 case AMDGPUISD::MUL_U24:
5840 case AMDGPUISD::MUL_I24: {
5841 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5842 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5843 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5844 RHSKnown.countMinTrailingZeros();
5845 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5846 // Skip extra check if all bits are known zeros.
5847 if (TrailZ >= 32)
5848 break;
5849
5850 // Truncate to 24 bits.
5851 LHSKnown = LHSKnown.trunc(24);
5852 RHSKnown = RHSKnown.trunc(24);
5853
5854 if (Opc == AMDGPUISD::MUL_I24) {
5855 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5856 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5857 unsigned MaxValBits = LHSValBits + RHSValBits;
5858 if (MaxValBits > 32)
5859 break;
5860 unsigned SignBits = 32 - MaxValBits + 1;
5861 bool LHSNegative = LHSKnown.isNegative();
5862 bool LHSNonNegative = LHSKnown.isNonNegative();
5863 bool LHSPositive = LHSKnown.isStrictlyPositive();
5864 bool RHSNegative = RHSKnown.isNegative();
5865 bool RHSNonNegative = RHSKnown.isNonNegative();
5866 bool RHSPositive = RHSKnown.isStrictlyPositive();
5867
5868 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5869 Known.Zero.setHighBits(SignBits);
5870 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5871 Known.One.setHighBits(SignBits);
5872 } else {
5873 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5874 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5875 unsigned MaxValBits = LHSValBits + RHSValBits;
5876 if (MaxValBits >= 32)
5877 break;
5878 Known.Zero.setBitsFrom(MaxValBits);
5879 }
5880 break;
5881 }
5882 case AMDGPUISD::PERM: {
5883 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5884 if (!CMask)
5885 return;
5886
5887 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5888 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5889 unsigned Sel = CMask->getZExtValue();
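// Each byte of Sel picks one byte of the result: selector values 0-3 take a
// byte from operand 1, 4-6 from operand 0, 0x0c yields a known 0x00 byte, and
// larger selectors a known 0xff byte; everything else is left unknown here.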
5890
5891 for (unsigned I = 0; I < 32; I += 8) {
5892 unsigned SelBits = Sel & 0xff;
5893 if (SelBits < 4) {
5894 SelBits *= 8;
5895 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5896 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5897 } else if (SelBits < 7) {
5898 SelBits = (SelBits & 3) * 8;
5899 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5900 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5901 } else if (SelBits == 0x0c) {
5902 Known.Zero |= 0xFFull << I;
5903 } else if (SelBits > 0x0c) {
5904 Known.One |= 0xFFull << I;
5905 }
5906 Sel >>= 8;
5907 }
5908 break;
5909 }
5910 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5911 Known.Zero.setHighBits(24);
5912 break;
5913 }
5914 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5915 Known.Zero.setHighBits(16);
5916 break;
5917 }
5918 case AMDGPUISD::LDS: {
5919 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5920 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5921
5922 Known.Zero.setHighBits(16);
5923 Known.Zero.setLowBits(Log2(Alignment));
5924 break;
5925 }
5926 case AMDGPUISD::SMIN3:
5927 case AMDGPUISD::SMAX3:
5928 case AMDGPUISD::SMED3:
5929 case AMDGPUISD::UMIN3:
5930 case AMDGPUISD::UMAX3:
5931 case AMDGPUISD::UMED3: {
5932 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5933 if (Known2.isUnknown())
5934 break;
5935
5936 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5937 if (Known1.isUnknown())
5938 break;
5939
5940 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5941 if (Known0.isUnknown())
5942 break;
5943
5944 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5945 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5946 Known.One = Known0.One & Known1.One & Known2.One;
5947 break;
5948 }
5949 case ISD::INTRINSIC_WO_CHAIN: {
5950 unsigned IID = Op.getConstantOperandVal(0);
5951 switch (IID) {
5952 case Intrinsic::amdgcn_workitem_id_x:
5953 case Intrinsic::amdgcn_workitem_id_y:
5954 case Intrinsic::amdgcn_workitem_id_z: {
5955 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5956 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5957 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5958 break;
5959 }
5960 default:
5961 break;
5962 }
5963 }
5964 }
5965}
5966
5967unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5968 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5969 unsigned Depth) const {
5970 switch (Op.getOpcode()) {
5971 case AMDGPUISD::BFE_I32: {
5972 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5973 if (!Width)
5974 return 1;
5975
5976 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5977 if (!isNullConstant(Op.getOperand(1)))
5978 return SignBits;
5979
5980 // TODO: Could probably figure something out with non-0 offsets.
5981 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5982 return std::max(SignBits, Op0SignBits);
5983 }
5984
5985 case AMDGPUISD::BFE_U32: {
5986 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5987 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5988 }
5989
5990 case AMDGPUISD::CARRY:
5991 case AMDGPUISD::BORROW:
5992 return 31;
5993 case AMDGPUISD::BUFFER_LOAD_BYTE:
5994 return 25;
5995 case AMDGPUISD::BUFFER_LOAD_SHORT:
5996 return 17;
5997 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5998 return 24;
5999 case AMDGPUISD::BUFFER_LOAD_USHORT:
6000 return 16;
6001 case AMDGPUISD::FP_TO_FP16:
6002 return 16;
6003 case AMDGPUISD::SMIN3:
6004 case AMDGPUISD::SMAX3:
6005 case AMDGPUISD::SMED3:
6006 case AMDGPUISD::UMIN3:
6007 case AMDGPUISD::UMAX3:
6008 case AMDGPUISD::UMED3: {
6009 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
6010 if (Tmp2 == 1)
6011 return 1; // Early out.
6012
6013 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
6014 if (Tmp1 == 1)
6015 return 1; // Early out.
6016
6017 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6018 if (Tmp0 == 1)
6019 return 1; // Early out.
6020
6021 return std::min({Tmp0, Tmp1, Tmp2});
6022 }
6023 default:
6024 return 1;
6025 }
6026}
6027
6028unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
6029 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6030 const MachineRegisterInfo &MRI, unsigned Depth) const {
6031 const MachineInstr *MI = MRI.getVRegDef(R);
6032 if (!MI)
6033 return 1;
6034
6035 // TODO: Check range metadata on MMO.
6036 switch (MI->getOpcode()) {
6037 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6038 return 25;
6039 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6040 return 17;
6041 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6042 return 24;
6043 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6044 return 16;
6045 case AMDGPU::G_AMDGPU_SMED3:
6046 case AMDGPU::G_AMDGPU_UMED3: {
6047 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6048 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
6049 if (Tmp2 == 1)
6050 return 1;
6051 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
6052 if (Tmp1 == 1)
6053 return 1;
6054 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
6055 if (Tmp0 == 1)
6056 return 1;
6057 return std::min({Tmp0, Tmp1, Tmp2});
6058 }
6059 default:
6060 return 1;
6061 }
6062}
6063
6064bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
6065 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6066 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
6067 unsigned Opcode = Op.getOpcode();
6068 switch (Opcode) {
6069 case AMDGPUISD::BFE_I32:
6070 case AMDGPUISD::BFE_U32:
6071 return false;
6072 }
6073 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
6074 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6075}
6076
6077bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
6078 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6079 unsigned Depth) const {
6080 unsigned Opcode = Op.getOpcode();
6081 switch (Opcode) {
6082 case AMDGPUISD::FMIN_LEGACY:
6083 case AMDGPUISD::FMAX_LEGACY: {
6084 if (SNaN)
6085 return true;
6086
6087 // TODO: Can check no nans on one of the operands for each one, but which
6088 // one?
6089 return false;
6090 }
6091 case AMDGPUISD::FMUL_LEGACY:
6092 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6093 if (SNaN)
6094 return true;
6095 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6096 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6097 }
6098 case AMDGPUISD::FMED3:
6099 case AMDGPUISD::FMIN3:
6100 case AMDGPUISD::FMAX3:
6101 case AMDGPUISD::FMINIMUM3:
6102 case AMDGPUISD::FMAXIMUM3:
6103 case AMDGPUISD::FMAD_FTZ: {
6104 if (SNaN)
6105 return true;
6106 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6107 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6108 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6109 }
6110 case AMDGPUISD::CVT_F32_UBYTE0:
6111 case AMDGPUISD::CVT_F32_UBYTE1:
6112 case AMDGPUISD::CVT_F32_UBYTE2:
6113 case AMDGPUISD::CVT_F32_UBYTE3:
6114 return true;
6115
6116 case AMDGPUISD::RCP:
6117 case AMDGPUISD::RSQ:
6118 case AMDGPUISD::RCP_LEGACY:
6119 case AMDGPUISD::RSQ_CLAMP: {
6120 if (SNaN)
6121 return true;
6122
6123 // TODO: Need is known positive check.
6124 return false;
6125 }
6126 case ISD::FLDEXP:
6127 case AMDGPUISD::FRACT: {
6128 if (SNaN)
6129 return true;
6130 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6131 }
6132 case AMDGPUISD::DIV_SCALE:
6133 case AMDGPUISD::DIV_FMAS:
6134 case AMDGPUISD::DIV_FIXUP:
6135 // TODO: Refine on operands.
6136 return SNaN;
6137 case AMDGPUISD::SIN_HW:
6138 case AMDGPUISD::COS_HW: {
6139 // TODO: Need check for infinity
6140 return SNaN;
6141 }
6142 case ISD::INTRINSIC_WO_CHAIN: {
6143 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6144 // TODO: Handle more intrinsics
6145 switch (IntrinsicID) {
6146 case Intrinsic::amdgcn_cubeid:
6147 case Intrinsic::amdgcn_cvt_off_f32_i4:
6148 return true;
6149
6150 case Intrinsic::amdgcn_frexp_mant: {
6151 if (SNaN)
6152 return true;
6153 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6154 }
6155 case Intrinsic::amdgcn_cvt_pkrtz: {
6156 if (SNaN)
6157 return true;
6158 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6159 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6160 }
6161 case Intrinsic::amdgcn_rcp:
6162 case Intrinsic::amdgcn_rsq:
6163 case Intrinsic::amdgcn_rcp_legacy:
6164 case Intrinsic::amdgcn_rsq_legacy:
6165 case Intrinsic::amdgcn_rsq_clamp:
6166 case Intrinsic::amdgcn_tanh: {
6167 if (SNaN)
6168 return true;
6169
6170 // TODO: Need is known positive check.
6171 return false;
6172 }
6173 case Intrinsic::amdgcn_trig_preop:
6174 case Intrinsic::amdgcn_fdot2:
6175 // TODO: Refine on operand
6176 return SNaN;
6177 case Intrinsic::amdgcn_fma_legacy:
6178 if (SNaN)
6179 return true;
6180 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6181 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6182 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6183 default:
6184 return false;
6185 }
6186 }
6187 default:
6188 return false;
6189 }
6190}
6191
6192bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6193 Register N0, Register N1) const {
6194 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6195}
unsigned const MachineRegisterInfo * MRI
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_READONLY
Definition Compiler.h:322
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1477
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1244
const fltSemantics & getSemantics() const
Definition APFloat.h:1520
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1262
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1221
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1161
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1406
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1400
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1157
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1403
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
iterator_range< arg_iterator > args()
Definition Function.h:896
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
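As a hedged illustration of how these SelectionDAG helpers compose (the helper name and operands are hypothetical, not taken from this file), getNOT and getNode can build an and-not pattern:
static SDValue buildAndNot(SelectionDAG &DAG, const SDLoc &DL, SDValue LHS,
                           SDValue RHS) {
  EVT VT = LHS.getValueType();
  SDValue NotRHS = DAG.getNOT(DL, RHS, VT); // materialized as (xor RHS, -1)
  return DAG.getNode(ISD::AND, DL, VT, LHS, NotRHS);
}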
const TargetLowering & getTargetLoweringInfo() const
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
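A minimal sketch combining getSetCC and getSelect; clampToZero and its operands are illustrative names only, and the condition type comes from TargetLowering::getSetCCResultType (also listed on this page):
static SDValue clampToZero(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue IsNeg = DAG.getSetCC(DL, CCVT, X, Zero, ISD::SETLT);
  return DAG.getSelect(DL, VT, IsNeg, Zero, X); // max(X, 0) for signed X
}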
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node that starts a new call frame, in which InSize bytes are set up inside ...
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
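A hedged sketch of a typical query built on computeKnownBits and the KnownBits accessors listed further below; the helper name and the 24-bit threshold are illustrative only:
static bool fitsInU24(SelectionDAG &DAG, SDValue Op) {
  KnownBits Known = DAG.computeKnownBits(Op);
  return Known.countMaxActiveBits() <= 24; // provably fits in 24 unsigned bits
}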
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
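A minimal sketch (hypothetical helper, not this file's lowering) showing how SplitScalar, getObjectPtrOffset and getStore cooperate to split an i64 store into two i32 stores:
static SDValue splitStoreI64(StoreSDNode *ST, SelectionDAG &DAG) {
  SDLoc DL(ST);
  auto [Lo, Hi] = DAG.SplitScalar(ST->getValue(), DL, MVT::i32, MVT::i32);
  SDValue BasePtr = ST->getBasePtr();
  SDValue HiPtr = DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(4));
  SDValue StLo = DAG.getStore(ST->getChain(), DL, Lo, BasePtr,
                              ST->getPointerInfo(), ST->getAlign());
  SDValue StHi = DAG.getStore(ST->getChain(), DL, Hi, HiPtr,
                              ST->getPointerInfo().getWithOffset(4),
                              commonAlignment(ST->getAlign(), 4));
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, StLo, StHi);
}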
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
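An illustrative fragment (not this file's actual settings) of how a target constructor uses these hooks; the chosen types and actions are assumptions:
// Expand f16 extending loads and truncating stores into explicit conversions.
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);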
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
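A hedged sketch of the usual fallback pattern around scalarizeVectorLoad; the wrapper name is invented for illustration:
static SDValue lowerUnhandledVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  auto [Value, Chain] = TLI.scalarizeVectorLoad(LD, DAG);
  return DAG.getMergeValues({Value, Chain}, SDLoc(LD));
}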
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op; at this point we know that only the DemandedBits bits of the result of Op are ever used downstream.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetOptions Options
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ FCANONICALIZE
Returns the platform-specific canonical encoding of a floating-point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ STRICT_FP16_TO_FP
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FFREXP
FFREXP - frexp, extract the fractional and exponent components of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:997
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:945
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
MaybeAlign getAlign(const CallInst &I, unsigned Index)
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:202
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:236
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
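A small standalone illustration (the values are arbitrary) of the Hi_32/Lo_32 helpers splitting a 64-bit immediate into its 32-bit halves:
#include "llvm/Support/MathExtras.h"
uint64_t Imm = 0x1122334455667788ULL;
uint32_t Lo = llvm::Lo_32(Imm); // 0x55667788
uint32_t Hi = llvm::Hi_32(Imm); // 0x11223344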
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1632
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
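A small standalone illustration (the values are arbitrary) of the Alignment.h helpers referenced here:
#include "llvm/Support/Alignment.h"
uint64_t Padded = llvm::alignTo(/*Size=*/10, llvm::Align(8));              // 16
llvm::Align Common = llvm::commonAlignment(llvm::Align(8), /*Offset=*/4);  // Align(4)
unsigned Shift = llvm::Log2(llvm::Align(8));                               // 3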
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:477
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:419
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:292
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:142
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:255
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:164
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:309
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:261
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:148
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:132
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition KnownBits.h:114
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition KnownBits.h:282
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...